add rss & crawl & webhook

This commit is contained in:
glidea
2025-06-05 23:29:37 +08:00
parent ead8286a48
commit d520444e9f
43 changed files with 1757 additions and 703 deletions

View File

@@ -37,7 +37,6 @@ import (
telemetry "github.com/glidea/zenfeed/pkg/telemetry"
telemetrymodel "github.com/glidea/zenfeed/pkg/telemetry/model"
jsonschema "github.com/glidea/zenfeed/pkg/util/json_schema"
"github.com/glidea/zenfeed/pkg/util/rpc"
)
// --- Interface code block ---
@@ -161,11 +160,11 @@ type QueryRequest struct {
}
func (r *QueryRequest) Validate() error { //nolint:cyclop
if r.Query != "" && utf8.RuneCountInString(r.Query) < 5 {
return errors.New("query must be at least 5 characters")
if r.Query != "" && utf8.RuneCountInString(r.Query) > 64 {
return errors.New("query must be at most 64 characters")
}
if r.Threshold == 0 {
r.Threshold = 0.55
r.Threshold = 0.5
}
if r.Threshold < 0 || r.Threshold > 1 {
return errors.New("threshold must be between 0 and 1")
@@ -200,6 +199,28 @@ type QueryResponse struct {
Count int `json:"count"`
}
type Error struct {
Code int `json:"code"`
Message string `json:"message"`
}
func (e Error) Error() string {
return e.Message
}
func newError(code int, err error) Error {
return Error{
Code: code,
Message: err.Error(),
}
}
var (
ErrBadRequest = func(err error) Error { return newError(http.StatusBadRequest, err) }
ErrNotFound = func(err error) Error { return newError(http.StatusNotFound, err) }
ErrInternal = func(err error) Error { return newError(http.StatusInternalServerError, err) }
)
// --- Factory code block ---
type Factory component.Factory[API, config.App, Dependencies]
@@ -262,7 +283,7 @@ func (a *api) QueryAppConfigSchema(
) (resp *QueryAppConfigSchemaResponse, err error) {
schema, err := jsonschema.ForType(reflect.TypeOf(config.App{}))
if err != nil {
return nil, rpc.ErrInternal(errors.Wrap(err, "query app config schema"))
return nil, ErrInternal(errors.Wrap(err, "query app config schema"))
}
return (*QueryAppConfigSchemaResponse)(&schema), nil
@@ -282,7 +303,7 @@ func (a *api) ApplyAppConfig(
req *ApplyAppConfigRequest,
) (resp *ApplyAppConfigResponse, err error) {
if err := a.Dependencies().ConfigManager.SaveAppConfig(&req.App); err != nil {
return nil, rpc.ErrBadRequest(errors.Wrap(err, "save app config"))
return nil, ErrBadRequest(errors.Wrap(err, "save app config"))
}
return &ApplyAppConfigResponse{}, nil
@@ -297,20 +318,20 @@ func (a *api) QueryRSSHubCategories(
// New request.
forwardReq, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
if err != nil {
return nil, rpc.ErrInternal(errors.Wrap(err, "new request"))
return nil, ErrInternal(errors.Wrap(err, "new request"))
}
// Do request.
forwardRespIO, err := a.hc.Do(forwardReq)
if err != nil {
return nil, rpc.ErrInternal(errors.Wrap(err, "query rss hub websites"))
return nil, ErrInternal(errors.Wrap(err, "query rss hub websites"))
}
defer func() { _ = forwardRespIO.Body.Close() }()
// Parse response.
var forwardResp map[string]RSSHubWebsite
if err := json.NewDecoder(forwardRespIO.Body).Decode(&forwardResp); err != nil {
return nil, rpc.ErrInternal(errors.Wrap(err, "parse response"))
return nil, ErrInternal(errors.Wrap(err, "parse response"))
}
// Convert to response.
@@ -333,7 +354,7 @@ func (a *api) QueryRSSHubWebsites(
ctx context.Context, req *QueryRSSHubWebsitesRequest,
) (resp *QueryRSSHubWebsitesResponse, err error) {
if req.Category == "" {
return nil, rpc.ErrBadRequest(errors.New("category is required"))
return nil, ErrBadRequest(errors.New("category is required"))
}
url := a.Config().RSSHubEndpoint + "/api/category/" + req.Category
@@ -341,29 +362,29 @@ func (a *api) QueryRSSHubWebsites(
// New request.
forwardReq, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
if err != nil {
return nil, rpc.ErrInternal(errors.Wrap(err, "new request"))
return nil, ErrInternal(errors.Wrap(err, "new request"))
}
// Do request.
forwardRespIO, err := a.hc.Do(forwardReq)
if err != nil {
return nil, rpc.ErrInternal(errors.Wrap(err, "query rss hub routes"))
return nil, ErrInternal(errors.Wrap(err, "query rss hub routes"))
}
defer func() { _ = forwardRespIO.Body.Close() }()
// Parse response.
body, err := io.ReadAll(forwardRespIO.Body)
if err != nil {
return nil, rpc.ErrInternal(errors.Wrap(err, "read response"))
return nil, ErrInternal(errors.Wrap(err, "read response"))
}
if len(body) == 0 {
// Hack for RSSHub...
// Consider cache category ids for validate by self to remove this shit code.
return nil, rpc.ErrBadRequest(errors.New("category id is invalid"))
return nil, ErrBadRequest(errors.New("category id is invalid"))
}
var forwardResp map[string]RSSHubWebsite
if err := json.Unmarshal(body, &forwardResp); err != nil {
return nil, rpc.ErrInternal(errors.Wrap(err, "parse response"))
return nil, ErrInternal(errors.Wrap(err, "parse response"))
}
// Convert to response.
@@ -383,7 +404,7 @@ func (a *api) QueryRSSHubRoutes(
req *QueryRSSHubRoutesRequest,
) (resp *QueryRSSHubRoutesResponse, err error) {
if req.WebsiteID == "" {
return nil, rpc.ErrBadRequest(errors.New("website id is required"))
return nil, ErrBadRequest(errors.New("website id is required"))
}
url := a.Config().RSSHubEndpoint + "/api/namespace/" + req.WebsiteID
@@ -391,30 +412,30 @@ func (a *api) QueryRSSHubRoutes(
// New request.
forwardReq, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
if err != nil {
return nil, rpc.ErrInternal(errors.Wrap(err, "new request"))
return nil, ErrInternal(errors.Wrap(err, "new request"))
}
// Do request.
forwardRespIO, err := a.hc.Do(forwardReq)
if err != nil {
return nil, rpc.ErrInternal(errors.Wrap(err, "query rss hub routes"))
return nil, ErrInternal(errors.Wrap(err, "query rss hub routes"))
}
defer func() { _ = forwardRespIO.Body.Close() }()
// Parse response.
body, err := io.ReadAll(forwardRespIO.Body)
if err != nil {
return nil, rpc.ErrInternal(errors.Wrap(err, "read response"))
return nil, ErrInternal(errors.Wrap(err, "read response"))
}
if len(body) == 0 {
return nil, rpc.ErrBadRequest(errors.New("website id is invalid"))
return nil, ErrBadRequest(errors.New("website id is invalid"))
}
var forwardResp struct {
Routes map[string]RSSHubRoute `json:"routes"`
}
if err := json.Unmarshal(body, &forwardResp); err != nil {
return nil, rpc.ErrInternal(errors.Wrap(err, "parse response"))
return nil, ErrInternal(errors.Wrap(err, "parse response"))
}
// Convert to response.
@@ -435,7 +456,7 @@ func (a *api) Write(ctx context.Context, req *WriteRequest) (resp *WriteResponse
feed.Labels.Put(model.LabelType, "api", false)
}
if err := a.Dependencies().FeedStorage.Append(ctx, req.Feeds...); err != nil {
return nil, rpc.ErrInternal(errors.Wrap(err, "append"))
return nil, ErrInternal(errors.Wrap(err, "append"))
}
return &WriteResponse{}, nil
@@ -447,7 +468,7 @@ func (a *api) Query(ctx context.Context, req *QueryRequest) (resp *QueryResponse
// Validate request.
if err := req.Validate(); err != nil {
return nil, rpc.ErrBadRequest(errors.Wrap(err, "validate"))
return nil, ErrBadRequest(errors.Wrap(err, "validate"))
}
// Forward to storage.
@@ -460,7 +481,7 @@ func (a *api) Query(ctx context.Context, req *QueryRequest) (resp *QueryResponse
End: req.End,
})
if err != nil {
return nil, rpc.ErrInternal(errors.Wrap(err, "query"))
return nil, ErrInternal(errors.Wrap(err, "query"))
}
if len(feeds) == 0 {
return &QueryResponse{Feeds: []*block.FeedVO{}}, nil

View File

@@ -26,9 +26,8 @@ import (
"github.com/glidea/zenfeed/pkg/config"
telemetry "github.com/glidea/zenfeed/pkg/telemetry"
"github.com/glidea/zenfeed/pkg/telemetry/log"
"github.com/glidea/zenfeed/pkg/telemetry/metric"
telemetrymodel "github.com/glidea/zenfeed/pkg/telemetry/model"
"github.com/glidea/zenfeed/pkg/util/rpc"
"github.com/glidea/zenfeed/pkg/util/jsonrpc"
)
// --- Interface code block ---
@@ -89,18 +88,14 @@ func new(instance string, app *config.App, dependencies Dependencies) (Server, e
router := http.NewServeMux()
api := dependencies.API
router.Handle("/metrics", metric.Handler())
router.Handle("/health", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(200)
}))
router.Handle("/write", rpc.API(api.Write))
router.Handle("/query_config", rpc.API(api.QueryAppConfig))
router.Handle("/apply_config", rpc.API(api.ApplyAppConfig))
router.Handle("/query_config_schema", rpc.API(api.QueryAppConfigSchema))
router.Handle("/query_rsshub_categories", rpc.API(api.QueryRSSHubCategories))
router.Handle("/query_rsshub_websites", rpc.API(api.QueryRSSHubWebsites))
router.Handle("/query_rsshub_routes", rpc.API(api.QueryRSSHubRoutes))
router.Handle("/query", rpc.API(api.Query))
router.Handle("/write", jsonrpc.API(api.Write))
router.Handle("/query_config", jsonrpc.API(api.QueryAppConfig))
router.Handle("/apply_config", jsonrpc.API(api.ApplyAppConfig))
router.Handle("/query_config_schema", jsonrpc.API(api.QueryAppConfigSchema))
router.Handle("/query_rsshub_categories", jsonrpc.API(api.QueryRSSHubCategories))
router.Handle("/query_rsshub_websites", jsonrpc.API(api.QueryRSSHubWebsites))
router.Handle("/query_rsshub_routes", jsonrpc.API(api.QueryRSSHubRoutes))
router.Handle("/query", jsonrpc.API(api.Query))
httpServer := &http.Server{Addr: config.Address, Handler: router}
return &server{

231
pkg/api/rss/rss.go Normal file
View File

@@ -0,0 +1,231 @@
// Copyright (C) 2025 wangyusong
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
package rss
import (
"fmt"
"net"
"net/http"
"text/template"
"time"
"github.com/benbjohnson/clock"
"github.com/gorilla/feeds"
"github.com/pkg/errors"
"github.com/glidea/zenfeed/pkg/api"
"github.com/glidea/zenfeed/pkg/component"
"github.com/glidea/zenfeed/pkg/config"
"github.com/glidea/zenfeed/pkg/model"
telemetry "github.com/glidea/zenfeed/pkg/telemetry"
"github.com/glidea/zenfeed/pkg/telemetry/log"
telemetrymodel "github.com/glidea/zenfeed/pkg/telemetry/model"
"github.com/glidea/zenfeed/pkg/util/buffer"
)
var clk = clock.New()
// --- Interface code block ---
type Server interface {
component.Component
config.Watcher
}
type Config struct {
Address string
ContentHTMLTemplate string
contentHTMLTemplate *template.Template
}
func (c *Config) Validate() error {
if c.Address == "" {
c.Address = ":1302"
}
if _, _, err := net.SplitHostPort(c.Address); err != nil {
return errors.Wrap(err, "invalid address")
}
if c.ContentHTMLTemplate == "" {
c.ContentHTMLTemplate = "{{ .summary_html_snippet }}"
}
t, err := template.New("").Parse(c.ContentHTMLTemplate)
if err != nil {
return errors.Wrap(err, "parse rss content template")
}
c.contentHTMLTemplate = t
return nil
}
func (c *Config) From(app *config.App) *Config {
c.Address = app.API.RSS.Address
c.ContentHTMLTemplate = app.API.RSS.ContentHTMLTemplate
return c
}
type Dependencies struct {
API api.API
}
// --- Factory code block ---
type Factory component.Factory[Server, config.App, Dependencies]
func NewFactory(mockOn ...component.MockOption) Factory {
if len(mockOn) > 0 {
return component.FactoryFunc[Server, config.App, Dependencies](
func(instance string, config *config.App, dependencies Dependencies) (Server, error) {
m := &mockServer{}
component.MockOptions(mockOn).Apply(&m.Mock)
return m, nil
},
)
}
return component.FactoryFunc[Server, config.App, Dependencies](new)
}
func new(instance string, app *config.App, dependencies Dependencies) (Server, error) {
config := &Config{}
config.From(app)
if err := config.Validate(); err != nil {
return nil, errors.Wrap(err, "validate config")
}
s := &server{
Base: component.New(&component.BaseConfig[Config, Dependencies]{
Name: "RSSServer",
Instance: instance,
Config: config,
Dependencies: dependencies,
}),
}
router := http.NewServeMux()
router.Handle("/", http.HandlerFunc(s.rss))
s.http = &http.Server{Addr: config.Address, Handler: router}
return s, nil
}
// --- Implementation code block ---
type server struct {
*component.Base[Config, Dependencies]
http *http.Server
}
func (s *server) Run() (err error) {
ctx := telemetry.StartWith(s.Context(), append(s.TelemetryLabels(), telemetrymodel.KeyOperation, "Run")...)
defer func() { telemetry.End(ctx, err) }()
serverErr := make(chan error, 1)
go func() {
serverErr <- s.http.ListenAndServe()
}()
s.MarkReady()
select {
case <-ctx.Done():
log.Info(ctx, "shutting down")
return s.http.Shutdown(ctx)
case err := <-serverErr:
return errors.Wrap(err, "listen and serve")
}
}
func (s *server) Reload(app *config.App) error {
newConfig := &Config{}
newConfig.From(app)
if err := newConfig.Validate(); err != nil {
return errors.Wrap(err, "validate config")
}
if s.Config().Address != newConfig.Address {
return errors.New("address cannot be reloaded")
}
s.SetConfig(newConfig)
return nil
}
func (s *server) rss(w http.ResponseWriter, r *http.Request) {
var err error
ctx := telemetry.StartWith(r.Context(), append(s.TelemetryLabels(), telemetrymodel.KeyOperation, "rss")...)
defer telemetry.End(ctx, err)
// Extract parameters.
ps := r.URL.Query()
labelFilters := ps["label_filter"]
query := ps.Get("query")
// Forward query request to API.
now := clk.Now()
queryResult, err := s.Dependencies().API.Query(ctx, &api.QueryRequest{
Query: query,
LabelFilters: labelFilters,
Start: now.Add(-24 * time.Hour),
End: now,
Limit: 100,
})
if err != nil {
http.Error(w, err.Error(), http.StatusBadRequest) // TODO: standardize error handling.
return
}
// Render and convert to RSS.
rssObj := &feeds.Feed{
Title: fmt.Sprintf("Zenfeed RSS - %s", ps.Encode()),
Description: "Powered by Github Zenfeed - https://github.com/glidea/zenfeed. If you use Folo, please enable 'Appearance - Content - Render inline styles'",
Items: make([]*feeds.Item, 0, len(queryResult.Feeds)),
}
buf := buffer.Get()
defer buffer.Put(buf)
for _, feed := range queryResult.Feeds {
buf.Reset()
if err = s.Config().contentHTMLTemplate.Execute(buf, feed.Labels.Map()); err != nil {
http.Error(w, err.Error(), http.StatusInternalServerError)
return
}
item := &feeds.Item{
Title: feed.Labels.Get(model.LabelTitle),
Link: &feeds.Link{Href: feed.Labels.Get(model.LabelLink)},
Created: feed.Time, // NOTE: scrape time, not pub time.
Content: buf.String(),
}
rssObj.Items = append(rssObj.Items, item)
}
if err = rssObj.WriteRss(w); err != nil {
log.Error(ctx, errors.Wrap(err, "write rss response"))
return
}
}
type mockServer struct {
component.Mock
}
func (m *mockServer) Reload(app *config.App) error {
return m.Called(app).Error(0)
}

View File

@@ -46,10 +46,13 @@ type Config struct {
}
type App struct {
Timezone string `yaml:"timezone,omitempty" json:"timezone,omitempty" desc:"The timezone of the app. e.g. Asia/Shanghai. Default: server's local timezone"`
Log struct {
Level string `yaml:"level,omitempty" json:"level,omitempty" desc:"Log level, one of debug, info, warn, error. Default: info"`
} `yaml:"log,omitempty" json:"log,omitempty" desc:"The log config."`
Timezone string `yaml:"timezone,omitempty" json:"timezone,omitempty" desc:"The timezone of the app. e.g. Asia/Shanghai. Default: server's local timezone"`
Telemetry struct {
Address string `yaml:"address,omitempty" json:"address,omitempty" desc:"The address ([host]:port) of the telemetry server. e.g. 0.0.0.0:9090. Default: :9090. It can not be changed after the app is running."`
Log struct {
Level string `yaml:"level,omitempty" json:"level,omitempty" desc:"Log level, one of debug, info, warn, error. Default: info"`
} `yaml:"log,omitempty" json:"log,omitempty" desc:"The log config."`
} `yaml:"telemetry,omitempty" json:"telemetry,omitempty" desc:"The telemetry config."`
API struct {
HTTP struct {
Address string `yaml:"address,omitempty" json:"address,omitempty" desc:"The address ([host]:port) of the HTTP API. e.g. 0.0.0.0:1300. Default: :1300. It can not be changed after the app is running."`
@@ -57,9 +60,16 @@ type App struct {
MCP struct {
Address string `yaml:"address,omitempty" json:"address,omitempty" desc:"The address ([host]:port) of the MCP API. e.g. 0.0.0.0:1300. Default: :1301. It can not be changed after the app is running."`
} `yaml:"mcp,omitempty" json:"mcp,omitempty" desc:"The MCP API config."`
RSS struct {
Address string `yaml:"address,omitempty" json:"address,omitempty" desc:"The address ([host]:port) of the RSS API. e.g. 0.0.0.0:1300. Default: :1302. It can not be changed after the app is running."`
ContentHTMLTemplate string `yaml:"content_html_template,omitempty" json:"content_html_template,omitempty" desc:"The template to render the RSS content for each item. Default is {{ .summary_html_snippet }}."`
} `yaml:"rss,omitempty" json:"rss,omitempty" desc:"The RSS config."`
LLM string `yaml:"llm,omitempty" json:"llm,omitempty" desc:"The LLM name for summarizing feeds. e.g. my-favorite-gemini-king. Default is the default LLM in llms section."`
} `yaml:"api,omitempty" json:"api,omitempty" desc:"The API config."`
LLMs []LLM `yaml:"llms,omitempty" json:"llms,omitempty" desc:"The LLMs config. It is required, at least one LLM is needed, refered by other config sections."`
LLMs []LLM `yaml:"llms,omitempty" json:"llms,omitempty" desc:"The LLMs config. It is required, at least one LLM is needed, refered by other config sections."`
Jina struct {
Token string `yaml:"token,omitempty" json:"token,omitempty" desc:"The token of the Jina server."`
} `yaml:"jina,omitempty" json:"jina,omitempty" desc:"The Jina config."`
Scrape Scrape `yaml:"scrape,omitempty" json:"scrape,omitempty" desc:"The scrape config."`
Storage Storage `yaml:"storage,omitempty" json:"storage,omitempty" desc:"The storage config."`
Scheduls struct {
@@ -116,6 +126,7 @@ type ScrapeSourceRSS struct {
}
type RewriteRule struct {
If []string `yaml:"if,omitempty" json:"if,omitempty" desc:"The condition config to match the feed. If not set, that means match all feeds. Like label filters, e.g. [source=github, title!=xxx]"`
SourceLabel string `yaml:"source_label,omitempty" json:"source_label,omitempty" desc:"The feed label of the source text to transform. Default is the 'content' label. The feed is essentially a label set (similar to Prometheus metric data). The default labels are type (rss, email (in future), etc), source (the source name), title (feed title), link (feed link), pub_time (feed publish time), and content (feed content)."`
SkipTooShortThreshold *int `yaml:"skip_too_short_threshold,omitempty" json:"skip_too_short_threshold,omitempty" desc:"The threshold of the source text length to skip. Default is 300. It helps we to filter out some short feeds."`
Transform *RewriteRuleTransform `yaml:"transform,omitempty" json:"transform,omitempty" desc:"The transform config to transform the source text. If not set, that means transform nothing, so the source text is the transformed text."`
@@ -130,6 +141,7 @@ type RewriteRuleTransform struct {
}
type RewriteRuleTransformToText struct {
Type string `yaml:"type,omitempty" json:"type,omitempty" desc:"The type of the transform. It can be one of prompt, crawl, crawl_by_jina. Default is prompt. For crawl, the source text will be as the url to crawl the page, and the page will be converted to markdown. crawl vs crawl_by_jina: crawl is local, more stable; crawl_by_jina is powered by https://jina.ai, more powerful."`
LLM string `yaml:"llm,omitempty" json:"llm,omitempty" desc:"The LLM name to use. Default is the default LLM in llms section."`
Prompt string `yaml:"prompt,omitempty" json:"prompt,omitempty" desc:"The prompt to transform the source text. The source text will be injected into the prompt above. And you can use go template syntax to refer some built-in prompts, like {{ .summary }}. Available built-in prompts: category, tags, score, comment_confucius, summary, summary_html_snippet."`
}
@@ -166,15 +178,14 @@ type NotifySubRoute struct {
}
type NotifyReceiver struct {
Name string `yaml:"name,omitempty" json:"name,omitempty" desc:"The name of the receiver. It is required."`
Email string `yaml:"email,omitempty" json:"email,omitempty" desc:"The email of the receiver."`
// TODO: to reduce copyright risk, we do not support webhook receiver now.
// Webhook *NotifyReceiverWebhook `yaml:"webhook" json:"webhook" desc:"The webhook of the receiver."`
Name string `yaml:"name,omitempty" json:"name,omitempty" desc:"The name of the receiver. It is required."`
Email string `yaml:"email,omitempty" json:"email,omitempty" desc:"The email of the receiver."`
Webhook *NotifyReceiverWebhook `yaml:"webhook" json:"webhook" desc:"The webhook of the receiver."`
}
// type NotifyReceiverWebhook struct {
// URL string `yaml:"url"`
// }
type NotifyReceiverWebhook struct {
URL string `yaml:"url"`
}
type NotifyChannels struct {
Email *NotifyChannelEmail `yaml:"email,omitempty" json:"email,omitempty" desc:"The global email channel config."`

View File

@@ -16,6 +16,7 @@
package llm
import (
"bytes"
"context"
"reflect"
"strconv"
@@ -33,6 +34,8 @@ import (
"github.com/glidea/zenfeed/pkg/storage/kv"
"github.com/glidea/zenfeed/pkg/telemetry/log"
telemetrymodel "github.com/glidea/zenfeed/pkg/telemetry/model"
binaryutil "github.com/glidea/zenfeed/pkg/util/binary"
"github.com/glidea/zenfeed/pkg/util/buffer"
"github.com/glidea/zenfeed/pkg/util/hash"
)
@@ -373,24 +376,94 @@ func newCached(llm LLM, kvStorage kv.Storage) LLM {
func (c *cached) String(ctx context.Context, messages []string) (string, error) {
key := hash.Sum64s(messages)
keyStr := strconv.FormatUint(key, 10)
keyStr := strconv.FormatUint(key, 10) // for human readable & compatible.
value, err := c.kvStorage.Get(ctx, keyStr)
valueBs, err := c.kvStorage.Get(ctx, []byte(keyStr))
switch {
case err == nil:
return value, nil
return string(valueBs), nil
case errors.Is(err, kv.ErrNotFound):
break
default:
return "", errors.Wrap(err, "get from kv storage")
}
value, err = c.LLM.String(ctx, messages)
value, err := c.LLM.String(ctx, messages)
if err != nil {
return "", err
}
if err = c.kvStorage.Set(ctx, keyStr, value, 65*time.Minute); err != nil {
// TODO: reduce copies.
if err = c.kvStorage.Set(ctx, []byte(keyStr), []byte(value), 65*time.Minute); err != nil {
log.Error(ctx, err, "set to kv storage")
}
return value, nil
}
var (
toBytes = func(v []float32) ([]byte, error) {
buf := buffer.Get()
defer buffer.Put(buf)
for _, fVal := range v {
if err := binaryutil.WriteFloat32(buf, fVal); err != nil {
return nil, errors.Wrap(err, "write float32")
}
}
// Must copy data, as the buffer will be reused.
bs := make([]byte, buf.Len())
copy(bs, buf.Bytes())
return bs, nil
}
toF32s = func(bs []byte) ([]float32, error) {
if len(bs)%4 != 0 {
return nil, errors.New("embedding data is corrupted, length not multiple of 4")
}
r := bytes.NewReader(bs)
floats := make([]float32, len(bs)/4)
for i := range floats {
f, err := binaryutil.ReadFloat32(r)
if err != nil {
return nil, errors.Wrap(err, "deserialize float32")
}
floats[i] = f
}
return floats, nil
}
)
func (c *cached) Embedding(ctx context.Context, text string) ([]float32, error) {
key := hash.Sum64(text)
keyStr := strconv.FormatUint(key, 10)
valueBs, err := c.kvStorage.Get(ctx, []byte(keyStr))
switch {
case err == nil:
return toF32s(valueBs)
case errors.Is(err, kv.ErrNotFound):
break
default:
return nil, errors.Wrap(err, "get from kv storage")
}
value, err := c.LLM.Embedding(ctx, text)
if err != nil {
return nil, err
}
valueBs, err = toBytes(value)
if err != nil {
return nil, errors.Wrap(err, "serialize embedding")
}
if err = c.kvStorage.Set(ctx, []byte(keyStr), valueBs, 65*time.Minute); err != nil {
log.Error(ctx, err, "set to kv storage")
}

View File

@@ -18,6 +18,7 @@ package llm
import (
"context"
"encoding/json"
"fmt"
"github.com/pkg/errors"
oai "github.com/sashabaranov/go-openai"
@@ -40,7 +41,7 @@ func newOpenAI(c *Config) LLM {
config := oai.DefaultConfig(c.APIKey)
config.BaseURL = c.Endpoint
client := oai.NewClientWithConfig(config)
embeddingSpliter := newEmbeddingSpliter(2048, 64)
embeddingSpliter := newEmbeddingSpliter(1536, 64)
return &openai{
Base: component.New(&component.BaseConfig[Config, struct{}]{
@@ -61,9 +62,9 @@ func (o *openai) String(ctx context.Context, messages []string) (value string, e
if config.Model == "" {
return "", errors.New("model is not set")
}
msg := make([]oai.ChatCompletionMessage, 0, len(messages))
msgs := make([]oai.ChatCompletionMessage, 0, len(messages))
for _, m := range messages {
msg = append(msg, oai.ChatCompletionMessage{
msgs = append(msgs, oai.ChatCompletionMessage{
Role: oai.ChatMessageRoleUser,
Content: m,
})
@@ -71,7 +72,7 @@ func (o *openai) String(ctx context.Context, messages []string) (value string, e
req := oai.ChatCompletionRequest{
Model: config.Model,
Messages: msg,
Messages: msgs,
Temperature: config.Temperature,
}
@@ -131,6 +132,7 @@ func (o *openai) Embedding(ctx context.Context, s string) (value []float32, err
EncodingFormat: oai.EmbeddingEncodingFormatFloat,
})
if err != nil {
fmt.Println(s)
return nil, errors.Wrap(err, "create embeddings")
}
if len(vec.Data) == 0 {
@@ -141,6 +143,6 @@ func (o *openai) Embedding(ctx context.Context, s string) (value []float32, err
promptTokens.WithLabelValues(lvs...).Add(float64(vec.Usage.PromptTokens))
completionTokens.WithLabelValues(lvs...).Add(float64(vec.Usage.CompletionTokens))
totalTokens.WithLabelValues(lvs...).Add(float64(vec.Usage.TotalTokens))
return vec.Data[0].Embedding, nil
}

156
pkg/llm/prompt/prompt.go Normal file
View File

@@ -0,0 +1,156 @@
package prompt
var Builtin = map[string]string{
"category": `
Analyze the content and categorize it into exactly one of these categories:
Technology, Development, Entertainment, Finance, Health, Politics, Other
Classification requirements:
- Choose the SINGLE most appropriate category based on:
* Primary topic and main focus of the content
* Key terminology and concepts used
* Target audience and purpose
* Technical depth and complexity level
- For content that could fit multiple categories:
* Identify the dominant theme
* Consider the most specific applicable category
* Use the primary intended purpose
- If content appears ambiguous:
* Focus on the most prominent aspects
* Consider the practical application
* Choose the category that best serves user needs
Output format:
Return ONLY the category name, no other text or explanation.
Must be one of the provided categories exactly as written.
`,
"tags": `
Analyze the content and add appropriate tags based on:
- Main topics and themes
- Key concepts and terminology
- Target audience and purpose
- Technical depth and domain
- 2-4 tags are enough
Output format:
Return a list of tags, separated by commas, no other text or explanation.
e.g. "AI, Technology, Innovation, Future"
`,
"score": `
Please give a score between 0 and 10 based on the following content.
Evaluate the content comprehensively considering clarity, accuracy, depth, logical structure, language expression, and completeness.
Note: If the content is an article or a text intended to be detailed, the length is an important factor. Generally, content under 300 words may receive a lower score due to lack of substance, unless its type (such as poetry or summary) is inherently suitable for brevity.
Output format:
Return the score (0-10), no other text or explanation.
E.g. "8", "5", "3", etc.
`,
"comment_confucius": `
Please act as Confucius and write a 100-word comment on the article.
Content needs to be in line with the Chinese mainland's regulations.
Output format:
Return the comment only, no other text or explanation.
Reply short and concise, 100 words is enough.
`,
"summary": `
Please read the article carefully and summarize its core content in the format of [Choice: Key Point List / Concise Paragraph]. The summary should clearly cover:
1. What is the main topic/theme of the article?
2. What key arguments/main information did the author put forward?
3. (Optional, if the article contains) What important data, cases, or examples are there?
4. What main conclusions did the article reach or what core information did it ultimately convey?
Strive for comprehensive, accurate, and concise.
`,
"summary_html_snippet": `
You are to act as a professional Content Designer. Your task is to convert the provided article into **visually modern HTML email snippets** that render well in modern email clients like Gmail and QQ Mail.
**Core Requirements:**
* **Highlighting and Layout Techniques (Based on the article content, you must actually use the HTML structure templates provided below to generate the content):**
A. **Stylish Quote Block** (for highlighting important points or direct quotes from the original text):
<div style="margin:20px 0; padding:20px; background:linear-gradient(to right, #f8f9fa, #ffffff); border-left:5px solid #4285f4; border-radius:5px; box-shadow:0 2px 8px rgba(0,0,0,0.05);">
<p style="margin:0; font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:16px; line-height:1.6; color:#333; font-weight:500;">
Insert the key point or finding to be highlighted here.
</p>
</div>
B. **Information Card** (for highlighting key data/metrics):
<div style="display:inline-block; margin:10px 10px 10px 0; padding:15px 20px; background-color:#ffffff; border-radius:8px; box-shadow:0 3px 10px rgba(0,0,0,0.08); min-width:120px; text-align:center;">
<p style="margin:0 0 5px 0; font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:14px; color:#666;">Metric Name</p>
<p style="margin:0; font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:24px; font-weight:600; color:#1a73e8;">75%</p>
</div>
C. **Key Points List** (for organizing multiple core points):
<ul style="margin:20px 0; padding-left:0; list-style-type:none;">
<li style="position:relative; margin-bottom:12px; padding-left:28px; font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:15px; line-height:1.6; color:#444;">
<span style="position:absolute; left:0; top:0; width:18px; height:18px; background-color:#4285f4; border-radius:50%; color:white; text-align:center; line-height:18px; font-size:12px;">1</span>
Description of the first key point
</li>
<li style="position:relative; margin-bottom:12px; padding-left:28px; font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:15px; line-height:1.6; color:#444;">
<span style="position:absolute; left:0; top:0; width:18px; height:18px; background-color:#4285f4; border-radius:50%; color:white; text-align:center; line-height:18px; font-size:12px;">2</span>
Description of the second key point
</li>
</ul>
D. **Emphasized Text** (for highlighting keywords or phrases):
<span style="background:linear-gradient(180deg, rgba(255,255,255,0) 50%, rgba(66,133,244,0.2) 50%); padding:0 2px;">Text to be emphasized</span>
E. **Comparison Table** (suitable for comparing different solutions or viewpoints):
<div style="margin:25px 0; padding:15px; background-color:#f8f9fa; border-radius:8px; overflow-x:auto;">
<table style="width:100%; border-collapse:collapse; font-family:'Google Sans',Roboto,Arial,sans-serif;">
<thead>
<tr>
<th style="padding:12px 15px; text-align:left; border-bottom:2px solid #e0e0e0; color:#202124; font-weight:500;">Feature</th>
<th style="padding:12px 15px; text-align:left; border-bottom:2px solid #e0e0e0; color:#202124; font-weight:500;">Option A</th>
<th style="padding:12px 15px; text-align:left; border-bottom:2px solid #e0e0e0; color:#202124; font-weight:500;">Option B</th>
</tr>
</thead>
<tbody>
<tr>
<td style="padding:12px 15px; border-bottom:1px solid #e0e0e0; color:#444;">Cost</td>
<td style="padding:12px 15px; border-bottom:1px solid #e0e0e0; color:#444;">Higher</td>
<td style="padding:12px 15px; border-bottom:1px solid #e0e0e0; color:#444;">Moderate</td>
</tr>
<tr>
<td style="padding:12px 15px; border-bottom:1px solid #e0e0e0; color:#444;">Efficiency</td>
<td style="padding:12px 15px; border-bottom:1px solid #e0e0e0; color:#444;">Very High</td>
<td style="padding:12px 15px; border-bottom:1px solid #e0e0e0; color:#444;">Average</td>
</tr>
</tbody>
</table>
</div>
* **Output Requirements:**
* The design should be **aesthetically pleasing and elegant, with harmonious color schemes**, ensuring sufficient **whitespace and contrast**.
* All article snippets must maintain a **consistent visual style**.
* You **must use multiple visual elements** and avoid mere text listings. **Use at least 2-3 different visual elements** to enhance readability and intuitive understanding.
* **Appropriately quote important original text snippets** to support explanations.
* **Strive to use highlighting styles to mark key points**.
* **Where appropriate, embed original images from the article to aid explanation.** Pay attention to the referrer policy: use referrerpolicy="no-referrer" on the <img> HTML element to ensure images display correctly.
* **Ensure overall reading flow is smooth and natural!!!** Guide the reader's thought process appropriately, minimizing abrupt jumps in logic.
* **Output only the HTML code snippet.** Do not include the full HTML document structure (i.e., no <html>, <head>, or <body> tags).
* **Do not add any explanatory text, extra comments, Markdown formatting, or HTML backticks.** Output the raw HTML code directly.
* **Do not add article titles or sources;** these will be automatically injected by the user later.
* **Do not use any opening remarks or pleasantries** (e.g., "Hi," "Let's talk about..."). Directly present the processed HTML content.
* **Do not refer to "this article," "this piece," "the current text," etc.** The user is aware of this context.
* **Only use inline styles, do not use global styles.** Remember to only generate HTML snippets.
* Do not explain anything, just output the HTML code snippet.
* Use above HTML components & its styles to generate the HTML code snippet, do not customize by yourself, else you will be fired.
* **Your Personality and Expression Preferences:**
* Focus on the most valuable information, not on every detail. The content should be readable within 3 minutes.
* Communicate **concisely and get straight to the point.
* ** Have a strong aversion to jargon, bureaucratic language, redundant embellishments, and grand narratives. Believe that plain, simple language can best convey truth.
* Be fluent, plain, concise, and not verbose.
* Be **plain, direct, clear, and easy to understand:** Use basic vocabulary and simple sentence structures. Avoid "sophisticated" complex sentences or unnecessary embellishments that increase reading burden.
* Enable readers to quickly grasp: "What is this? What is it generally about? What is its relevance/real-world significance to me (an ordinary person)?" Focus on providing an **overview**, not an accumulation of details.
* Be well-versed in cognitive science; understand how to phrase information so that someone without prior background can quickly understand the core content.
* **Extract key information and core insights,** rather than directly copying the original text. Do not omit crucial information and viewpoints. For example, for forum posts, the main points from comments are also very important!
* Avoid large blocks of text, strive for a combination of pictures and text.
`,
}

View File

@@ -30,6 +30,7 @@ import (
const (
AppName = "zenfeed"
Module = "github.com/glidea/zenfeed"
)
// LabelXXX is the metadata label for the feed.
@@ -233,6 +234,76 @@ type Label struct {
Value string `json:"value"`
}
const (
LabelFilterEqual = "="
LabelFilterNotEqual = "!="
)
type LabelFilter struct {
Label string
Equal bool
Value string
}
func NewLabelFilter(filter string) (LabelFilter, error) {
eq := false
parts := strings.Split(filter, LabelFilterNotEqual)
if len(parts) != 2 {
parts = strings.Split(filter, LabelFilterEqual)
eq = true
}
if len(parts) != 2 {
return LabelFilter{}, errors.New("invalid label filter")
}
return LabelFilter{Label: parts[0], Value: parts[1], Equal: eq}, nil
}
func (f LabelFilter) Match(labels Labels) bool {
lv := labels.Get(f.Label)
if lv == "" {
return false
}
if f.Equal && lv == f.Value {
return true
}
if !f.Equal && lv != f.Value {
return true
}
return false
}
type LabelFilters []LabelFilter
func (ls LabelFilters) Match(labels Labels) bool {
if len(ls) == 0 {
return true // No filters, always match.
}
for _, l := range ls {
if !l.Match(labels) {
return false
}
}
return true
}
func NewLabelFilters(filters []string) (LabelFilters, error) {
ls := make(LabelFilters, len(filters))
for i, f := range filters {
lf, err := NewLabelFilter(f)
if err != nil {
return nil, errors.Wrapf(err, "new label filter %q", f)
}
ls[i] = lf
}
return ls, nil
}
// readExpectedDelim reads the next token and checks if it's the expected delimiter.
func readExpectedDelim(dec *json.Decoder, expected json.Delim) error {
t, err := dec.Token()

View File

@@ -124,10 +124,9 @@ func (c *aggrChannel) Send(ctx context.Context, receiver Receiver, group *route.
if receiver.Email != "" && c.email != nil {
return c.send(ctx, receiver, group, c.email, "email")
}
// if receiver.Webhook != nil && c.webhook != nil {
// TODO: temporarily disable webhook to reduce copyright risks.
// return c.send(ctx, receiver, group, c.webhook, "webhook")
// }
if receiver.Webhook != nil && c.webhook != nil {
return c.send(ctx, receiver, group, c.webhook, "webhook")
}
return nil
}

View File

@@ -134,53 +134,53 @@ func (e *email) buildEmail(receiver Receiver, group *route.FeedGroup) (*gomail.M
if err != nil {
return nil, errors.Wrap(err, "build email body HTML")
}
m.SetBody("text/html", string(body))
m.SetBody("text/html", body)
return m, nil
}
func (e *email) buildBodyHTML(group *route.FeedGroup) ([]byte, error) {
func (e *email) buildBodyHTML(group *route.FeedGroup) (string, error) {
bodyBuf := buffer.Get()
defer buffer.Put(bodyBuf)
// Write HTML header.
if err := e.writeHTMLHeader(bodyBuf); err != nil {
return nil, errors.Wrap(err, "write HTML header")
return "", errors.Wrap(err, "write HTML header")
}
// Write summary.
if err := e.writeSummary(bodyBuf, group.Summary); err != nil {
return nil, errors.Wrap(err, "write summary")
return "", errors.Wrap(err, "write summary")
}
// Write each feed content.
if _, err := bodyBuf.WriteString(`
<div style="margin-top:20px; padding-top:15px; border-top:1px solid #f1f3f4;">
<p style="font-size:32px; font-weight:500; margin:0 0 10px 0;">Feeds</p>`); err != nil {
return nil, errors.Wrap(err, "write feeds header")
return "", errors.Wrap(err, "write feeds header")
}
for i, feed := range group.Feeds {
if err := e.writeFeedContent(bodyBuf, feed); err != nil {
return nil, errors.Wrap(err, "write feed content")
return "", errors.Wrap(err, "write feed content")
}
// Add separator (except the last feed).
if i < len(group.Feeds)-1 {
if err := e.writeSeparator(bodyBuf); err != nil {
return nil, errors.Wrap(err, "write separator")
return "", errors.Wrap(err, "write separator")
}
}
}
// Write disclaimer and HTML footer.
if err := e.writeDisclaimer(bodyBuf); err != nil {
return nil, errors.Wrap(err, "write disclaimer")
return "", errors.Wrap(err, "write disclaimer")
}
if err := e.writeHTMLFooter(bodyBuf); err != nil {
return nil, errors.Wrap(err, "write HTML footer")
return "", errors.Wrap(err, "write HTML footer")
}
return bodyBuf.Bytes(), nil
return bodyBuf.String(), nil
}
func (e *email) writeHTMLHeader(buf *buffer.Bytes) error {

View File

@@ -41,9 +41,10 @@ func (r *WebhookReceiver) Validate() error {
}
type webhookBody struct {
Group string `json:"group"`
Labels model.Labels `json:"labels"`
Feeds []*route.Feed `json:"feeds"`
Group string `json:"group"`
Labels model.Labels `json:"labels"`
Summary string `json:"summary"`
Feeds []*route.Feed `json:"feeds"`
}
func newWebhook() sender {
@@ -59,9 +60,10 @@ type webhook struct {
func (w *webhook) Send(ctx context.Context, receiver Receiver, group *route.FeedGroup) error {
// Prepare request.
body := &webhookBody{
Group: group.Name,
Labels: group.Labels,
Feeds: group.Feeds,
Group: group.Name,
Labels: group.Labels,
Summary: group.Summary,
Feeds: group.Feeds,
}
b := runtimeutil.Must1(json.Marshal(body))
req, err := http.NewRequestWithContext(ctx, http.MethodPost, receiver.Webhook.URL, bytes.NewReader(b))

View File

@@ -86,9 +86,9 @@ func (c *Config) From(app *config.App) *Config {
if app.Notify.Receivers[i].Email != "" {
c.Receivers[i].Email = app.Notify.Receivers[i].Email
}
// if app.Notify.Receivers[i].Webhook != nil {
// c.Receivers[i].Webhook = &channel.WebhookReceiver{URL: app.Notify.Receivers[i].Webhook.URL}
// }
if app.Notify.Receivers[i].Webhook != nil {
c.Receivers[i].Webhook = &channel.WebhookReceiver{URL: app.Notify.Receivers[i].Webhook.URL}
}
}
c.Channels = channel.Config{}
@@ -438,8 +438,8 @@ func (n *notifier) send(ctx context.Context, work sendWork) error {
return channel.Send(ctx, work.receiver.Receiver, work.group)
}
var nlogKey = func(group *route.FeedGroup, receiver Receiver) string {
return fmt.Sprintf("notifier.group.%s.receiver.%s.%d", group.Name, receiver.Name, group.Time.Unix())
var nlogKey = func(group *route.FeedGroup, receiver Receiver) []byte {
return fmt.Appendf(nil, "notifier.group.%s.receiver.%s.%d", group.Name, receiver.Name, group.Time.Unix())
}
func (n *notifier) isSent(ctx context.Context, group *route.FeedGroup, receiver Receiver) bool {
@@ -457,7 +457,7 @@ func (n *notifier) isSent(ctx context.Context, group *route.FeedGroup, receiver
}
func (n *notifier) markSent(ctx context.Context, group *route.FeedGroup, receiver Receiver) error {
return n.Dependencies().KVStorage.Set(ctx, nlogKey(group, receiver), timeutil.Format(time.Now()), timeutil.Day)
return n.Dependencies().KVStorage.Set(ctx, nlogKey(group, receiver), []byte(timeutil.Format(time.Now())), timeutil.Day)
}
type sendWork struct {

View File

@@ -72,56 +72,25 @@ func (s SubRoutes) Match(feed *block.FeedVO) *SubRoute {
type SubRoute struct {
Route
Matchers []string
matchers []matcher
matchers model.LabelFilters
}
func (r *SubRoute) Match(feed *block.FeedVO) *SubRoute {
// Match sub routes.
for _, subRoute := range r.SubRoutes {
if matched := subRoute.Match(feed); matched != nil {
return matched
}
}
for _, m := range r.matchers {
fv := feed.Labels.Get(m.key)
switch m.equal {
case true:
if fv != m.value {
return nil
}
default:
if fv == m.value {
return nil
}
}
// Match self.
if !r.matchers.Match(feed.Labels) {
return nil
}
return r
}
type matcher struct {
key string
value string
equal bool
}
var (
matcherEqual = "="
matcherNotEqual = "!="
parseMatcher = func(filter string) (matcher, error) {
eq := false
parts := strings.Split(filter, matcherNotEqual)
if len(parts) != 2 {
parts = strings.Split(filter, matcherEqual)
eq = true
}
if len(parts) != 2 {
return matcher{}, errors.New("invalid matcher")
}
return matcher{key: parts[0], value: parts[1], equal: eq}, nil
}
)
func (r *SubRoute) Validate() error {
if len(r.GroupBy) == 0 {
r.GroupBy = []string{model.LabelSource}
@@ -129,17 +98,16 @@ func (r *SubRoute) Validate() error {
if r.CompressByRelatedThreshold == nil {
r.CompressByRelatedThreshold = ptr.To(float32(0.85))
}
if len(r.Matchers) == 0 {
return errors.New("matchers is required")
}
r.matchers = make([]matcher, len(r.Matchers))
for i, matcher := range r.Matchers {
m, err := parseMatcher(matcher)
if err != nil {
return errors.Wrap(err, "invalid matcher")
}
r.matchers[i] = m
matchers, err := model.NewLabelFilters(r.Matchers)
if err != nil {
return errors.Wrap(err, "invalid matchers")
}
r.matchers = matchers
for _, subRoute := range r.SubRoutes {
if err := subRoute.Validate(); err != nil {
return errors.Wrap(err, "invalid sub_route")
@@ -151,7 +119,7 @@ func (r *SubRoute) Validate() error {
func (c *Config) Validate() error {
if len(c.GroupBy) == 0 {
c.GroupBy = []string{model.LabelSource}
c.GroupBy = []string{model.LabelType}
}
if c.CompressByRelatedThreshold == nil {
c.CompressByRelatedThreshold = ptr.To(float32(0.85))
@@ -179,8 +147,8 @@ type FeedGroup struct {
Name string
Time time.Time
Labels model.Labels
Feeds []*Feed
Summary string
Feeds []*Feed
}
func (g *FeedGroup) ID() string {

View File

@@ -19,8 +19,8 @@ import (
"context"
"html/template"
"regexp"
"strings"
"unicode/utf8"
"unsafe"
"github.com/pkg/errors"
"k8s.io/utils/ptr"
@@ -28,14 +28,15 @@ import (
"github.com/glidea/zenfeed/pkg/component"
"github.com/glidea/zenfeed/pkg/config"
"github.com/glidea/zenfeed/pkg/llm"
"github.com/glidea/zenfeed/pkg/llm/prompt"
"github.com/glidea/zenfeed/pkg/model"
"github.com/glidea/zenfeed/pkg/telemetry"
telemetrymodel "github.com/glidea/zenfeed/pkg/telemetry/model"
"github.com/glidea/zenfeed/pkg/util/buffer"
"github.com/glidea/zenfeed/pkg/util/crawl"
)
// --- Interface code block ---
type Rewriter interface {
component.Component
config.Watcher
@@ -71,6 +72,11 @@ type Dependencies struct {
}
type Rule struct {
// If is the condition to check before applying the rule.
// If not set, the rule will be applied.
If []string
if_ model.LabelFilters
// SourceLabel specifies which label's value to use as source text.
// Default is model.LabelContent.
SourceLabel string
@@ -96,29 +102,51 @@ type Rule struct {
}
func (r *Rule) Validate() error { //nolint:cyclop
// If.
if len(r.If) > 0 {
if_, err := model.NewLabelFilters(r.If)
if err != nil {
return errors.Wrapf(err, "invalid if %q", r.If)
}
r.if_ = if_
}
// Source label.
if r.SourceLabel == "" {
r.SourceLabel = model.LabelContent
}
if r.SkipTooShortThreshold == nil {
r.SkipTooShortThreshold = ptr.To(300)
r.SkipTooShortThreshold = ptr.To(0)
}
// Transform.
if r.Transform != nil {
if r.Transform.ToText.Prompt == "" {
return errors.New("to text prompt is required")
if r.Transform.ToText == nil {
return errors.New("to_text is required when transform is set")
}
tmpl, err := template.New("").Parse(r.Transform.ToText.Prompt)
if err != nil {
return errors.Wrapf(err, "parse prompt template %s", r.Transform.ToText.Prompt)
switch r.Transform.ToText.Type {
case ToTextTypePrompt:
if r.Transform.ToText.Prompt == "" {
return errors.New("to text prompt is required for prompt type")
}
tmpl, err := template.New("").Parse(r.Transform.ToText.Prompt)
if err != nil {
return errors.Wrapf(err, "parse prompt template %s", r.Transform.ToText.Prompt)
}
buf := buffer.Get()
defer buffer.Put(buf)
if err := tmpl.Execute(buf, prompt.Builtin); err != nil {
return errors.Wrapf(err, "execute prompt template %s", r.Transform.ToText.Prompt)
}
r.Transform.ToText.promptRendered = buf.String()
case ToTextTypeCrawl, ToTextTypeCrawlByJina:
// No specific validation for crawl type here, as the source text itself is the URL.
default:
return errors.Errorf("unknown transform type: %s", r.Transform.ToText.Type)
}
buf := buffer.Get()
defer buffer.Put(buf)
if err := tmpl.Execute(buf, promptTemplates); err != nil {
return errors.Wrapf(err, "execute prompt template %s", r.Transform.ToText.Prompt)
}
r.Transform.ToText.promptRendered = buf.String()
}
// Match.
@@ -148,15 +176,21 @@ func (r *Rule) Validate() error { //nolint:cyclop
}
func (r *Rule) From(c *config.RewriteRule) {
r.If = c.If
r.SourceLabel = c.SourceLabel
r.SkipTooShortThreshold = c.SkipTooShortThreshold
if c.Transform != nil {
t := &Transform{}
if c.Transform.ToText != nil {
t.ToText = &ToText{
toText := &ToText{
LLM: c.Transform.ToText.LLM,
Prompt: c.Transform.ToText.Prompt,
}
toText.Type = ToTextType(c.Transform.ToText.Type)
if toText.Type == "" {
toText.Type = ToTextTypePrompt // Default to prompt if not specified.
}
t.ToText = toText
}
r.Transform = t
}
@@ -173,15 +207,27 @@ type Transform struct {
}
type ToText struct {
Type ToTextType
// LLM is the name of the LLM to use.
// Only used when Type is ToTextTypePrompt.
LLM string
// Prompt is the prompt for LLM completion.
// The source text will automatically be injected into the prompt.
// Only used when Type is ToTextTypePrompt.
Prompt string
promptRendered string
}
type ToTextType string
const (
ToTextTypePrompt ToTextType = "prompt"
ToTextTypeCrawl ToTextType = "crawl"
ToTextTypeCrawlByJina ToTextType = "crawl_by_jina"
)
type Action string
const (
@@ -189,233 +235,7 @@ const (
ActionCreateOrUpdateLabel Action = "create_or_update_label"
)
var promptTemplates = map[string]string{
"category": `
Analyze the content and categorize it into exactly one of these categories:
Technology, Development, Entertainment, Finance, Health, Politics, Other
Classification requirements:
- Choose the SINGLE most appropriate category based on:
* Primary topic and main focus of the content
* Key terminology and concepts used
* Target audience and purpose
* Technical depth and complexity level
- For content that could fit multiple categories:
* Identify the dominant theme
* Consider the most specific applicable category
* Use the primary intended purpose
- If content appears ambiguous:
* Focus on the most prominent aspects
* Consider the practical application
* Choose the category that best serves user needs
Output format:
Return ONLY the category name, no other text or explanation.
Must be one of the provided categories exactly as written.
`,
"tags": `
Analyze the content and add appropriate tags based on:
- Main topics and themes
- Key concepts and terminology
- Target audience and purpose
- Technical depth and domain
- 2-4 tags are enough
Output format:
Return a list of tags, separated by commas, no other text or explanation.
e.g. "AI, Technology, Innovation, Future"
`,
"score": `
Please give a score between 0 and 10 based on the following content.
Evaluate the content comprehensively considering clarity, accuracy, depth, logical structure, language expression, and completeness.
Note: If the content is an article or a text intended to be detailed, the length is an important factor. Generally, content under 300 words may receive a lower score due to lack of substance, unless its type (such as poetry or summary) is inherently suitable for brevity.
Output format:
Return the score (0-10), no other text or explanation.
E.g. "8", "5", "3", etc.
`,
"comment_confucius": `
Please act as Confucius and write a 100-word comment on the article.
Content needs to be in line with the Chinese mainland's regulations.
Output format:
Return the comment only, no other text or explanation.
Reply short and concise, 100 words is enough.
`,
"summary": `
Summarize the article in 100-200 words.
`,
"summary_html_snippet": `
# Task: Create Visually Appealing Information Summary Emails
You are a professional content designer. Please convert the provided articles into **visually modern HTML email segments**, focusing on display effects in modern clients like Gmail and QQ Mail.
## Key Requirements:
1. **Output Format**:
- Only output HTML code snippets, **no need for complete HTML document structure**
- Only generate HTML code for a single article, so users can combine multiple pieces into a complete email
- No explanations, additional comments, or markups
- **No need to add titles and sources**, users will inject them automatically
- No use html backticks, output raw html code directly
- Output directly, no explanation, no comments, no markups
2. **Content Processing**:
- **Don't directly copy the original text**, but extract key information and core insights from each article
- **Each article summary should be 100-200 words**, don't force word count, adjust the word count based on the actual length of the article
- Summarize points in relaxed, natural language, as if chatting with friends, while maintaining depth
- Maintain the original language of the article (e.g., Chinese summary for Chinese articles)
3. **Visual Design**:
- Design should be aesthetically pleasing with coordinated colors
- Use sufficient whitespace and contrast
- Maintain a consistent visual style across all articles
- **Must use multiple visual elements** (charts, cards, quote blocks, etc.), avoid pure text presentation
- Each article should use at least 2-3 different visual elements to make content more intuitive and readable
4. **Highlight Techniques**:
A. **Beautiful Quote Blocks** (for highlighting important viewpoints):
<div style="margin:20px 0; padding:20px; background:linear-gradient(to right, #f8f9fa, #ffffff); border-left:5px solid #4285f4; border-radius:5px; box-shadow:0 2px 8px rgba(0,0,0,0.05);">
<p style="margin:0; font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:16px; line-height:1.6; color:#333; font-weight:500;">
Here is the key viewpoint or finding that needs to be highlighted.
</p>
</div>
B. **Information Cards** (for highlighting key data):
<div style="display:inline-block; margin:10px 10px 10px 0; padding:15px 20px; background-color:#ffffff; border-radius:8px; box-shadow:0 3px 10px rgba(0,0,0,0.08); min-width:120px; text-align:center;">
<p style="margin:0 0 5px 0; font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:14px; color:#666;">Metric Name</p>
<p style="margin:0; font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:24px; font-weight:600; color:#1a73e8;">75%</p>
</div>
C. **Key Points List** (for highlighting multiple points):
<ul style="margin:20px 0; padding-left:0; list-style-type:none;">
<li style="position:relative; margin-bottom:12px; padding-left:28px; font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:15px; line-height:1.6; color:#444;">
<span style="position:absolute; left:0; top:0; width:18px; height:18px; background-color:#4285f4; border-radius:50%; color:white; text-align:center; line-height:18px; font-size:12px;">1</span>
First point description
</li>
<li style="position:relative; margin-bottom:12px; padding-left:28px; font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:15px; line-height:1.6; color:#444;">
<span style="position:absolute; left:0; top:0; width:18px; height:18px; background-color:#4285f4; border-radius:50%; color:white; text-align:center; line-height:18px; font-size:12px;">2</span>
Second point description
</li>
</ul>
D. **Emphasis Text** (for highlighting key words or phrases):
<span style="background:linear-gradient(180deg, rgba(255,255,255,0) 50%, rgba(66,133,244,0.2) 50%); padding:0 2px;">Text to emphasize</span>
5. **Timeline Design** (suitable for event sequences or news developments):
<div style="margin:25px 0; padding:5px 0;">
<h3 style="font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:18px; color:#333; margin-bottom:15px;">Event Development Timeline</h3>
<div style="position:relative; margin-left:30px; padding-left:30px; border-left:2px solid #e0e0e0;">
<!-- Time Point 1 -->
<div style="position:relative; margin-bottom:25px;">
<div style="position:absolute; width:16px; height:16px; background-color:#4285f4; border-radius:50%; left:-40px; top:0; border:3px solid #ffffff; box-shadow:0 2px 5px rgba(0,0,0,0.1);"></div>
<p style="margin:0 0 5px 0; font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:14px; font-weight:500; color:#4285f4;">June 1, 2023</p>
<p style="margin:0; font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:15px; line-height:1.5; color:#333;">Event description content, concisely explaining the key points and impact of the event.</p>
</div>
<!-- Time Point 2 -->
<div style="position:relative; margin-bottom:25px;">
<div style="position:absolute; width:16px; height:16px; background-color:#4285f4; border-radius:50%; left:-40px; top:0; border:3px solid #ffffff; box-shadow:0 2px 5px rgba(0,0,0,0.1);"></div>
<p style="margin:0 0 5px 0; font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:14px; font-weight:500; color:#4285f4;">June 15, 2023</p>
<p style="margin:0; font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:15px; line-height:1.5; color:#333;">Event description content, concisely explaining the key points and impact of the event.</p>
</div>
</div>
</div>
6. **Comparison Table** (for comparing different options or viewpoints):
<div style="margin:25px 0; padding:15px; background-color:#f8f9fa; border-radius:8px; overflow-x:auto;">
<table style="width:100%; border-collapse:collapse; font-family:'Google Sans',Roboto,Arial,sans-serif;">
<thead>
<tr>
<th style="padding:12px 15px; text-align:left; border-bottom:2px solid #e0e0e0; color:#202124; font-weight:500;">Feature</th>
<th style="padding:12px 15px; text-align:left; border-bottom:2px solid #e0e0e0; color:#202124; font-weight:500;">Option A</th>
<th style="padding:12px 15px; text-align:left; border-bottom:2px solid #e0e0e0; color:#202124; font-weight:500;">Option B</th>
</tr>
</thead>
<tbody>
<tr>
<td style="padding:12px 15px; border-bottom:1px solid #e0e0e0; color:#444;">Cost</td>
<td style="padding:12px 15px; border-bottom:1px solid #e0e0e0; color:#444;">Higher</td>
<td style="padding:12px 15px; border-bottom:1px solid #e0e0e0; color:#444;">Moderate</td>
</tr>
<tr>
<td style="padding:12px 15px; border-bottom:1px solid #e0e0e0; color:#444;">Efficiency</td>
<td style="padding:12px 15px; border-bottom:1px solid #e0e0e0; color:#444;">Very High</td>
<td style="padding:12px 15px; border-bottom:1px solid #e0e0e0; color:#444;">Average</td>
</tr>
</tbody>
</table>
</div>
7. **Chart Data Processing**:
- Bar Chart/Horizontal Bars:
<div style="margin:20px 0; padding:15px; background-color:#f8f9fa; border-radius:8px;">
<p style="margin:0 0 15px 0; font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:16px; font-weight:500; color:#333;">Data Comparison</p>
<!-- Item 1 -->
<div style="margin-bottom:12px;">
<div style="display:flex; align-items:center; justify-content:space-between; margin-bottom:5px;">
<span style="font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:14px; color:#555;">Project A</span>
<span style="font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:14px; font-weight:500; color:#333;">65%</span>
</div>
<div style="height:10px; width:100%; background-color:#e8eaed; border-radius:5px; overflow:hidden;">
<div style="height:100%; width:65%; background:linear-gradient(to right, #4285f4, #5e97f6); border-radius:5px;"></div>
</div>
</div>
<!-- Item 2 -->
<div style="margin-bottom:12px;">
<div style="display:flex; align-items:center; justify-content:space-between; margin-bottom:5px;">
<span style="font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:14px; color:#555;">Project B</span>
<span style="font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:14px; font-weight:500; color:#333;">42%</span>
</div>
<div style="height:10px; width:100%; background-color:#e8eaed; border-radius:5px; overflow:hidden;">
<div style="height:100%; width:42%; background:linear-gradient(to right, #ea4335, #f07575); border-radius:5px;"></div>
</div>
</div>
</div>
8. **Highlight Box** (for displaying tips or reminders):
<div style="margin:25px 0; padding:20px; background-color:#fffde7; border-radius:8px; border-left:4px solid #fdd835; box-shadow:0 1px 5px rgba(0,0,0,0.05);">
<div style="display:flex; align-items:flex-start;">
<div style="flex-shrink:0; margin-right:15px; width:24px; height:24px; background-color:#fdd835; border-radius:50%; display:flex; align-items:center; justify-content:center;">
<span style="color:#fff; font-weight:bold; font-size:16px;">!</span>
</div>
<div>
<p style="margin:0 0 5px 0; font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:16px; font-weight:500; color:#333;">Tip</p>
<p style="margin:0; font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:15px; line-height:1.6; color:#555;">
Here are some additional tips or suggestions to help readers better understand or apply the article content.
</p>
</div>
</div>
</div>
9. **Summary Box**:
<div style="margin:25px 0; padding:20px; background-color:#f2f7fd; border-radius:8px; box-shadow:0 1px 5px rgba(66,133,244,0.1);">
<p style="margin:0 0 10px 0; font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:16px; font-weight:500; color:#1a73e8;">In Simple Terms</p>
<p style="margin:0; font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:15px; line-height:1.6; color:#333;">
This is a concise summary of the entire content, highlighting the most critical findings and conclusions.
</p>
</div>
## Notes:
1. **Only generate content for a single article**, not including title and source, and not including HTML head and tail structure
2. Content should be **200-300 words**, don't force word count
3. **Must use multiple visual elements** (at least 2-3 types), avoid monotonous pure text presentation
4. Use relaxed, natural language, as if chatting with friends
5. Create visual charts for important data, rather than just describing with text
6. Use quote blocks to highlight important viewpoints, and lists to organize multiple points
7. Appropriately use emojis and conversational expressions to increase friendliness
8. Note that the article content has been provided in the previous message, please reply directly, no explanation, no comments, no markups
`,
}
// --- Factory code block ---
type Factory component.Factory[Rewriter, config.App, Dependencies]
func NewFactory(mockOn ...component.MockOption) Factory {
@@ -445,6 +265,8 @@ func new(instance string, app *config.App, dependencies Dependencies) (Rewriter,
Config: c,
Dependencies: dependencies,
}),
crawler: crawl.NewLocal(),
jinaCrawler: crawl.NewJina(app.Jina.Token),
}, nil
}
@@ -452,6 +274,9 @@ func new(instance string, app *config.App, dependencies Dependencies) (Rewriter,
type rewriter struct {
*component.Base[Config, Dependencies]
crawler crawl.Crawler
jinaCrawler crawl.Crawler
}
func (r *rewriter) Reload(app *config.App) error {
@@ -462,6 +287,8 @@ func (r *rewriter) Reload(app *config.App) error {
}
r.SetConfig(newConfig)
r.jinaCrawler = crawl.NewJina(app.Jina.Token)
return nil
}
@@ -471,6 +298,11 @@ func (r *rewriter) Labels(ctx context.Context, labels model.Labels) (rewritten m
rules := *r.Config()
for _, rule := range rules {
// If.
if !rule.if_.Match(labels) {
continue
}
// Get source text based on source label.
sourceText := labels.Get(rule.SourceLabel)
if utf8.RuneCountInString(sourceText) < *rule.SkipTooShortThreshold {
@@ -479,7 +311,7 @@ func (r *rewriter) Labels(ctx context.Context, labels model.Labels) (rewritten m
// Transform text if configured.
text := sourceText
if rule.Transform != nil {
if rule.Transform != nil && rule.Transform.ToText != nil {
transformed, err := r.transformText(ctx, rule.Transform, sourceText)
if err != nil {
return nil, errors.Wrap(err, "transform text")
@@ -506,15 +338,37 @@ func (r *rewriter) Labels(ctx context.Context, labels model.Labels) (rewritten m
return labels, nil
}
// transformText transforms text using configured LLM.
// transformText transforms text using configured LLM or by crawling a URL.
func (r *rewriter) transformText(ctx context.Context, transform *Transform, text string) (string, error) {
switch transform.ToText.Type {
case ToTextTypeCrawl:
return r.transformTextCrawl(ctx, r.crawler, text)
case ToTextTypeCrawlByJina:
return r.transformTextCrawl(ctx, r.jinaCrawler, text)
case ToTextTypePrompt:
return r.transformTextPrompt(ctx, transform, text)
default:
return r.transformTextPrompt(ctx, transform, text)
}
}
func (r *rewriter) transformTextCrawl(ctx context.Context, crawler crawl.Crawler, url string) (string, error) {
mdBytes, err := crawler.Markdown(ctx, url)
if err != nil {
return "", errors.Wrapf(err, "crawl %s", url)
}
return string(mdBytes), nil
}
// transformTextPrompt transforms text using configured LLM.
func (r *rewriter) transformTextPrompt(ctx context.Context, transform *Transform, text string) (string, error) {
// Get LLM instance.
llm := r.Dependencies().LLMFactory.Get(transform.ToText.LLM)
// Call completion.
result, err := llm.String(ctx, []string{
transform.ToText.promptRendered,
"The content to be processed is below, and the processing requirements are as above",
text, // TODO: may place to first line to hit the model cache in different rewrite rules.
})
if err != nil {
@@ -525,32 +379,11 @@ func (r *rewriter) transformText(ctx context.Context, transform *Transform, text
}
func (r *rewriter) transformTextHack(text string) string {
bytes := unsafe.Slice(unsafe.StringData(text), len(text))
start := 0
end := len(bytes)
// Remove the last line if it's empty.
// This is a hack to avoid the model output a empty line.
// E.g. category: tech\n
if end > 0 && bytes[end-1] == '\n' {
end--
}
// Remove the html backticks.
if end-start >= 7 && string(bytes[start:start+7]) == "```html" {
start += 7
}
if end-start >= 3 && string(bytes[end-3:end]) == "```" {
end -= 3
}
// If no changes, return the original string.
if start == 0 && end == len(bytes) {
return text
}
// Only copy one time.
return string(bytes[start:end])
// TODO: optimize this.
text = strings.ReplaceAll(text, "```html", "")
text = strings.ReplaceAll(text, "```markdown", "")
text = strings.ReplaceAll(text, "```", "")
return text
}
type mockRewriter struct {

View File

@@ -44,6 +44,7 @@ func TestLabels(t *testing.T) {
SkipTooShortThreshold: ptr.To(10),
Transform: &Transform{
ToText: &ToText{
Type: ToTextTypePrompt,
LLM: "mock-llm",
Prompt: "{{ .category }}", // Using a simple template for testing
},
@@ -79,6 +80,7 @@ func TestLabels(t *testing.T) {
SkipTooShortThreshold: ptr.To(10),
Transform: &Transform{
ToText: &ToText{
Type: ToTextTypePrompt,
LLM: "mock-llm",
Prompt: "{{ .category }}",
},
@@ -148,6 +150,7 @@ func TestLabels(t *testing.T) {
SkipTooShortThreshold: ptr.To(10),
Transform: &Transform{
ToText: &ToText{
Type: ToTextTypePrompt,
LLM: "mock-llm",
Prompt: "{{ .category }}",
promptRendered: "Analyze the content and categorize it...",
@@ -186,6 +189,7 @@ func TestLabels(t *testing.T) {
SkipTooShortThreshold: ptr.To(10),
Transform: &Transform{
ToText: &ToText{
Type: ToTextTypePrompt,
LLM: "mock-llm",
Prompt: "{{ .category }}",
promptRendered: "Analyze the content and categorize it...",

View File

@@ -55,7 +55,7 @@ func (r *periodic) Run() (err error) {
end := time.Date(today.Year(), today.Month(), today.Day(),
config.end.Hour(), config.end.Minute(), 0, 0, today.Location())
buffer := 20 * time.Minute
buffer := 30 * time.Minute
endPlusBuffer := end.Add(buffer)
if now.Before(end) || now.After(endPlusBuffer) {
return

View File

@@ -18,7 +18,6 @@ package rule
import (
"strings"
"time"
"unicode/utf8"
"github.com/pkg/errors"
@@ -58,11 +57,8 @@ func (c *Config) Validate() error { //nolint:cyclop,gocognit
if c.Name == "" {
return errors.New("name is required")
}
if c.Query != "" && utf8.RuneCountInString(c.Query) < 5 {
return errors.New("query must be at least 5 characters")
}
if c.Threshold == 0 {
c.Threshold = 0.6
c.Threshold = 0.5
}
if c.Threshold < 0 || c.Threshold > 1 {
return errors.New("threshold must be between 0 and 1")

View File

@@ -65,7 +65,6 @@ func newRSSReader(config *ScrapeSourceRSS) (reader, error) {
}
// --- Implementation code block ---
type rssReader struct {
config *ScrapeSourceRSS
client client

View File

@@ -227,7 +227,7 @@ func (s *scraper) filterExists(ctx context.Context, feeds []*model.Feed) (filter
appendToResult := func(feed *model.Feed) {
key := keyPrefix + strconv.FormatUint(feed.ID, 10)
value := timeutil.Format(feed.Time)
if err := s.Dependencies().KVStorage.Set(ctx, key, value, ttl); err != nil {
if err := s.Dependencies().KVStorage.Set(ctx, []byte(key), []byte(value), ttl); err != nil {
log.Error(ctx, err, "set last try store time")
}
filtered = append(filtered, feed)
@@ -236,7 +236,7 @@ func (s *scraper) filterExists(ctx context.Context, feeds []*model.Feed) (filter
for _, feed := range feeds {
key := keyPrefix + strconv.FormatUint(feed.ID, 10)
lastTryStored, err := s.Dependencies().KVStorage.Get(ctx, key)
lastTryStored, err := s.Dependencies().KVStorage.Get(ctx, []byte(key))
switch {
default:
log.Error(ctx, err, "get last stored time, fallback to continue writing")
@@ -246,7 +246,7 @@ func (s *scraper) filterExists(ctx context.Context, feeds []*model.Feed) (filter
appendToResult(feed)
case err == nil:
t, err := timeutil.Parse(lastTryStored)
t, err := timeutil.Parse(string(lastTryStored))
if err != nil {
log.Error(ctx, err, "parse last try stored time, fallback to continue writing")
appendToResult(feed)

View File

@@ -26,7 +26,6 @@ import (
"runtime"
"slices"
"strconv"
"strings"
"sync"
"sync/atomic"
"time"
@@ -277,47 +276,20 @@ type QueryOptions struct {
Query string
Threshold float32
LabelFilters []string
labelFilters []LabelFilter
labelFilters model.LabelFilters
Limit int
Start, End time.Time
}
var (
LabelFilterEqual = "="
LabelFilterNotEqual = "!="
NewLabelFilter = func(key, value string, eq bool) string {
if eq {
return fmt.Sprintf("%s%s%s", key, LabelFilterEqual, value)
}
return fmt.Sprintf("%s%s%s", key, LabelFilterNotEqual, value)
}
ParseLabelFilter = func(filter string) (LabelFilter, error) {
eq := false
parts := strings.Split(filter, LabelFilterNotEqual)
if len(parts) != 2 {
parts = strings.Split(filter, LabelFilterEqual)
eq = true
}
if len(parts) != 2 {
return LabelFilter{}, errors.New("invalid label filter")
}
return LabelFilter{Label: parts[0], Value: parts[1], Equal: eq}, nil
}
)
func (q *QueryOptions) Validate() error { //nolint:cyclop
if q.Threshold < 0 || q.Threshold > 1 {
return errors.New("threshold must be between 0 and 1")
}
for _, labelFilter := range q.LabelFilters {
if labelFilter == "" {
for _, s := range q.LabelFilters {
if s == "" {
return errors.New("label filter is required")
}
filter, err := ParseLabelFilter(labelFilter)
filter, err := model.NewLabelFilter(s)
if err != nil {
return errors.Wrap(err, "parse label filter")
}
@@ -368,13 +340,6 @@ func (q *QueryOptions) HitTimeRangeCondition(b Block) bool {
return queryAsBase || blockAsBase
}
// LabelFilter defines the matcher for an item.
type LabelFilter struct {
Label string
Equal bool
Value string
}
// --- Factory code block ---
type Factory component.Factory[Block, Config, Dependencies]
@@ -1228,14 +1193,14 @@ func (b *block) applyFilters(ctx context.Context, query *QueryOptions) (res filt
return b.mergeFilterResults(labelsResult, vectorsResult), nil
}
func (b *block) applyLabelFilters(ctx context.Context, filters []LabelFilter) filterResult {
func (b *block) applyLabelFilters(ctx context.Context, filters model.LabelFilters) filterResult {
if len(filters) == 0 {
return matchedAllFilterResult
}
var allIDs map[uint64]struct{}
for _, filter := range filters {
ids := b.invertedIndex.Search(ctx, filter.Label, filter.Equal, filter.Value)
ids := b.invertedIndex.Search(ctx, filter)
if len(ids) == 0 {
return matchedNothingFilterResult
}
@@ -1317,7 +1282,7 @@ func (b *block) mergeFilterResults(x, y filterResult) filterResult {
}
func (b *block) fillEmbedding(ctx context.Context, feeds []*model.Feed) ([]*chunk.Feed, error) {
embedded := make([]*chunk.Feed, len(feeds))
embedded := make([]*chunk.Feed, 0, len(feeds))
llm := b.Dependencies().LLMFactory.Get(b.Config().embeddingLLM)
var wg sync.WaitGroup
var mu sync.Mutex
@@ -1336,16 +1301,21 @@ func (b *block) fillEmbedding(ctx context.Context, feeds []*model.Feed) ([]*chun
}
mu.Lock()
embedded[i] = &chunk.Feed{
embedded = append(embedded, &chunk.Feed{
Feed: feed,
Vectors: vectors,
}
})
mu.Unlock()
}(i, feed)
}
wg.Wait()
if len(errs) > 0 {
return nil, errs[0]
switch len(errs) {
case 0:
case len(feeds):
return nil, errs[0] // All failed.
default:
log.Error(ctx, errors.Wrap(errs[0], "fill embedding"), "error_count", len(errs))
}
return embedded, nil

View File

@@ -24,7 +24,7 @@ type Index interface {
index.Codec
// Search returns item IDs matching the given label and value.
Search(ctx context.Context, label string, eq bool, value string) (ids map[uint64]struct{})
Search(ctx context.Context, matcher model.LabelFilter) (ids map[uint64]struct{})
// Add adds item to the index.
// If label or value in labels is empty, it will be ignored.
// If value is too long, it will be ignored,
@@ -88,17 +88,17 @@ type idx struct {
mu sync.RWMutex
}
func (idx *idx) Search(ctx context.Context, label string, eq bool, value string) (ids map[uint64]struct{}) {
func (idx *idx) Search(ctx context.Context, matcher model.LabelFilter) (ids map[uint64]struct{}) {
ctx = telemetry.StartWith(ctx, append(idx.TelemetryLabels(), telemetrymodel.KeyOperation, "Search")...)
defer func() { telemetry.End(ctx, nil) }()
idx.mu.RLock()
defer idx.mu.RUnlock()
if value == "" {
return idx.searchEmptyValue(label, eq)
if matcher.Value == "" {
return idx.searchEmptyValue(matcher.Label, matcher.Equal)
}
return idx.searchNonEmptyValue(label, eq, value)
return idx.searchNonEmptyValue(matcher)
}
func (idx *idx) Add(ctx context.Context, id uint64, labels model.Labels) {
@@ -198,16 +198,16 @@ func (idx *idx) searchEmptyValue(label string, eq bool) map[uint64]struct{} {
// searchNonEmptyValue handles the search logic when the target value is not empty.
// If eq is true, it returns IDs that have the exact label-value pair.
// If eq is false, it returns IDs that *do not* have the exact label-value pair.
func (idx *idx) searchNonEmptyValue(label string, eq bool, value string) map[uint64]struct{} {
func (idx *idx) searchNonEmptyValue(matcher model.LabelFilter) map[uint64]struct{} {
// Get the map of values for the given label.
values, labelExists := idx.m[label]
values, labelExists := idx.m[matcher.Label]
// If equal (eq), find the exact match.
if eq {
if matcher.Equal {
if !labelExists {
return make(map[uint64]struct{}) // Label doesn't exist.
}
ids, valueExists := values[value]
ids, valueExists := values[matcher.Value]
if !valueExists {
return make(map[uint64]struct{}) // Value doesn't exist for this label.
}
@@ -221,7 +221,7 @@ func (idx *idx) searchNonEmptyValue(label string, eq bool, value string) map[uin
resultIDs := maps.Clone(idx.ids)
if labelExists {
// If the specific label-value pair exists, remove its associated IDs.
if matchingIDs, valueExists := values[value]; valueExists {
if matchingIDs, valueExists := values[matcher.Value]; valueExists {
for id := range matchingIDs {
delete(resultIDs, id)
}
@@ -413,8 +413,8 @@ type mockIndex struct {
component.Mock
}
func (m *mockIndex) Search(ctx context.Context, label string, eq bool, value string) (ids map[uint64]struct{}) {
args := m.Called(ctx, label, eq, value)
func (m *mockIndex) Search(ctx context.Context, matcher model.LabelFilter) (ids map[uint64]struct{}) {
args := m.Called(ctx, matcher)
return args.Get(0).(map[uint64]struct{})
}

View File

@@ -118,9 +118,7 @@ func TestSearch(t *testing.T) {
setupLabels map[uint64]model.Labels
}
type whenDetail struct {
searchLabel string
eq bool
searchValue string
matcher model.LabelFilter
}
type thenExpected struct {
want []uint64
@@ -140,9 +138,11 @@ func TestSearch(t *testing.T) {
},
},
WhenDetail: whenDetail{
searchLabel: "category",
searchValue: "tech",
eq: true,
matcher: model.LabelFilter{
Label: "category",
Value: "tech",
Equal: true,
},
},
ThenExpected: thenExpected{
want: []uint64{1, 2},
@@ -159,9 +159,11 @@ func TestSearch(t *testing.T) {
},
},
WhenDetail: whenDetail{
searchLabel: "invalid",
searchValue: "value",
eq: true,
matcher: model.LabelFilter{
Label: "invalid",
Value: "value",
Equal: true,
},
},
ThenExpected: thenExpected{
want: nil,
@@ -178,9 +180,11 @@ func TestSearch(t *testing.T) {
},
},
WhenDetail: whenDetail{
searchLabel: "category",
searchValue: "invalid",
eq: true,
matcher: model.LabelFilter{
Label: "category",
Value: "invalid",
Equal: true,
},
},
ThenExpected: thenExpected{
want: nil,
@@ -200,9 +204,11 @@ func TestSearch(t *testing.T) {
},
},
WhenDetail: whenDetail{
searchLabel: "category",
searchValue: "tech",
eq: false,
matcher: model.LabelFilter{
Label: "category",
Value: "tech",
Equal: false,
},
},
ThenExpected: thenExpected{
want: []uint64{2},
@@ -220,9 +226,11 @@ func TestSearch(t *testing.T) {
},
},
WhenDetail: whenDetail{
searchLabel: "invalid",
searchValue: "value",
eq: false,
matcher: model.LabelFilter{
Label: "invalid",
Value: "value",
Equal: false,
},
},
ThenExpected: thenExpected{
want: []uint64{1, 2},
@@ -240,7 +248,7 @@ func TestSearch(t *testing.T) {
}
// When.
result := idx.Search(context.Background(), tt.WhenDetail.searchLabel, tt.WhenDetail.eq, tt.WhenDetail.searchValue)
result := idx.Search(context.Background(), tt.WhenDetail.matcher)
// Then.
if tt.ThenExpected.want == nil {

View File

@@ -32,8 +32,8 @@ import (
// --- Interface code block ---
type Storage interface {
component.Component
Get(ctx context.Context, key string) (string, error)
Set(ctx context.Context, key string, value string, ttl time.Duration) error
Get(ctx context.Context, key []byte) ([]byte, error)
Set(ctx context.Context, key []byte, value []byte, ttl time.Duration) error
}
var ErrNotFound = errors.New("not found")
@@ -137,7 +137,7 @@ func (k *kv) Close() error {
const bucket = "0"
func (k *kv) Get(ctx context.Context, key string) (value string, err error) {
func (k *kv) Get(ctx context.Context, key []byte) (value []byte, err error) {
ctx = telemetry.StartWith(ctx, append(k.TelemetryLabels(), telemetrymodel.KeyOperation, "Get")...)
defer func() {
telemetry.End(ctx, func() error {
@@ -157,22 +157,22 @@ func (k *kv) Get(ctx context.Context, key string) (value string, err error) {
})
switch {
case err == nil:
return string(b), nil
return b, nil
case errors.Is(err, nutsdb.ErrNotFoundKey):
return "", ErrNotFound
return nil, ErrNotFound
case strings.Contains(err.Error(), "key not found"):
return "", ErrNotFound
return nil, ErrNotFound
default:
return "", err
return nil, err
}
}
func (k *kv) Set(ctx context.Context, key string, value string, ttl time.Duration) (err error) {
func (k *kv) Set(ctx context.Context, key []byte, value []byte, ttl time.Duration) (err error) {
ctx = telemetry.StartWith(ctx, append(k.TelemetryLabels(), telemetrymodel.KeyOperation, "Set")...)
defer func() { telemetry.End(ctx, err) }()
return k.db.Update(func(tx *nutsdb.Tx) error {
return tx.Put(bucket, []byte(key), []byte(value), uint32(ttl.Seconds()))
return tx.Put(bucket, key, value, uint32(ttl.Seconds()))
})
}
@@ -180,13 +180,13 @@ type mockKV struct {
component.Mock
}
func (m *mockKV) Get(ctx context.Context, key string) (string, error) {
func (m *mockKV) Get(ctx context.Context, key []byte) ([]byte, error) {
args := m.Called(ctx, key)
return args.String(0), args.Error(1)
return args.Get(0).([]byte), args.Error(1)
}
func (m *mockKV) Set(ctx context.Context, key string, value string, ttl time.Duration) error {
func (m *mockKV) Set(ctx context.Context, key []byte, value []byte, ttl time.Duration) error {
args := m.Called(ctx, key, value, ttl)
return args.Error(0)

View File

@@ -27,6 +27,8 @@ import (
"github.com/pkg/errors"
slogdedup "github.com/veqryn/slog-dedup"
"github.com/glidea/zenfeed/pkg/model"
)
type Level string
@@ -187,7 +189,8 @@ func getStack(skip, depth int) string {
}
first = false
b.WriteString(frame.Function)
fn := strings.TrimPrefix(frame.Function, model.Module) // no module prefix for zenfeed self.
b.WriteString(fn)
b.WriteByte(':')
b.WriteString(strconv.Itoa(frame.Line))
}

View File

@@ -0,0 +1,137 @@
// Copyright (C) 2025 wangyusong
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
package http
import (
"net"
"net/http"
"net/http/pprof"
"github.com/pkg/errors"
"github.com/glidea/zenfeed/pkg/component"
"github.com/glidea/zenfeed/pkg/config"
telemetry "github.com/glidea/zenfeed/pkg/telemetry"
"github.com/glidea/zenfeed/pkg/telemetry/log"
"github.com/glidea/zenfeed/pkg/telemetry/metric"
telemetrymodel "github.com/glidea/zenfeed/pkg/telemetry/model"
)
// --- Interface code block ---
type Server interface {
component.Component
}
type Config struct {
Address string
}
func (c *Config) Validate() error {
if c.Address == "" {
c.Address = ":9090"
}
if _, _, err := net.SplitHostPort(c.Address); err != nil {
return errors.Wrap(err, "invalid address")
}
return nil
}
func (c *Config) From(app *config.App) *Config {
c.Address = app.Telemetry.Address
return c
}
type Dependencies struct {
}
// --- Factory code block ---
type Factory component.Factory[Server, config.App, Dependencies]
func NewFactory(mockOn ...component.MockOption) Factory {
if len(mockOn) > 0 {
return component.FactoryFunc[Server, config.App, Dependencies](
func(instance string, config *config.App, dependencies Dependencies) (Server, error) {
m := &mockServer{}
component.MockOptions(mockOn).Apply(&m.Mock)
return m, nil
},
)
}
return component.FactoryFunc[Server, config.App, Dependencies](new)
}
func new(instance string, app *config.App, dependencies Dependencies) (Server, error) {
config := &Config{}
config.From(app)
if err := config.Validate(); err != nil {
return nil, errors.Wrap(err, "validate config")
}
router := http.NewServeMux()
router.Handle("/health", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(200)
}))
router.Handle("/metrics", metric.Handler())
router.HandleFunc("/pprof", pprof.Index)
router.HandleFunc("/pprof/cmdline", pprof.Cmdline)
router.HandleFunc("/pprof/profile", pprof.Profile)
router.HandleFunc("/pprof/symbol", pprof.Symbol)
router.HandleFunc("/pprof/trace", pprof.Trace)
return &server{
Base: component.New(&component.BaseConfig[Config, Dependencies]{
Name: "TelemetryServer",
Instance: instance,
Config: config,
Dependencies: dependencies,
}),
http: &http.Server{Addr: config.Address, Handler: router},
}, nil
}
// --- Implementation code block ---
type server struct {
*component.Base[Config, Dependencies]
http *http.Server
}
func (s *server) Run() (err error) {
ctx := telemetry.StartWith(s.Context(), append(s.TelemetryLabels(), telemetrymodel.KeyOperation, "Run")...)
defer func() { telemetry.End(ctx, err) }()
serverErr := make(chan error, 1)
go func() {
serverErr <- s.http.ListenAndServe()
}()
s.MarkReady()
select {
case <-ctx.Done():
log.Info(ctx, "shutting down")
return s.http.Shutdown(ctx)
case err := <-serverErr:
return errors.Wrap(err, "listen and serve")
}
}
type mockServer struct {
component.Mock
}

176
pkg/util/crawl/crawl.go Normal file
View File

@@ -0,0 +1,176 @@
package crawl
import (
"context"
"fmt"
"io"
"net/http"
"net/url"
"sync"
"github.com/pkg/errors"
"github.com/temoto/robotstxt"
"github.com/glidea/zenfeed/pkg/util/text_convert"
)
type Crawler interface {
Markdown(ctx context.Context, u string) ([]byte, error)
}
type local struct {
hc *http.Client
robotsDataCache sync.Map
}
func NewLocal() Crawler {
return &local{
hc: &http.Client{},
}
}
func (c *local) Markdown(ctx context.Context, u string) ([]byte, error) {
// Check if the page is allowed.
if err := c.checkAllowed(ctx, u); err != nil {
return nil, errors.Wrapf(err, "check robots.txt for %s", u)
}
// Prepare the request.
req, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil)
if err != nil {
return nil, errors.Wrapf(err, "create request for %s", u)
}
req.Header.Set("User-Agent", userAgent)
// Send the request.
resp, err := c.hc.Do(req)
if err != nil {
return nil, errors.Wrapf(err, "fetch %s", u)
}
defer resp.Body.Close()
// Parse the response.
if resp.StatusCode != http.StatusOK {
return nil, errors.Errorf("received non-200 status code %d from %s", resp.StatusCode, u)
}
bodyBytes, err := io.ReadAll(resp.Body)
if err != nil {
return nil, errors.Wrapf(err, "read body from %s", u)
}
// Convert the body to markdown.
mdBytes, err := textconvert.HTMLToMarkdown(bodyBytes)
if err != nil {
return nil, errors.Wrap(err, "convert html to markdown")
}
return mdBytes, nil
}
const userAgent = "ZenFeed"
func (c *local) checkAllowed(ctx context.Context, u string) error {
parsedURL, err := url.Parse(u)
if err != nil {
return errors.Wrapf(err, "parse url %s", u)
}
d, err := c.getRobotsData(ctx, parsedURL.Host)
if err != nil {
return errors.Wrapf(err, "check robots.txt for %s", parsedURL.Host)
}
if !d.TestAgent(parsedURL.Path, userAgent) {
return errors.Errorf("disallowed by robots.txt for %s", u)
}
return nil
}
// getRobotsData fetches and parses robots.txt for a given host.
func (c *local) getRobotsData(ctx context.Context, host string) (*robotstxt.RobotsData, error) {
// Check the cache.
if data, found := c.robotsDataCache.Load(host); found {
return data.(*robotstxt.RobotsData), nil
}
// Prepare the request.
robotsURL := fmt.Sprintf("https://%s/robots.txt", host)
req, err := http.NewRequestWithContext(ctx, http.MethodGet, robotsURL, nil)
if err != nil {
return nil, errors.Wrapf(err, "create request for %s", robotsURL)
}
req.Header.Set("User-Agent", userAgent)
// Send the request.
resp, err := c.hc.Do(req)
if err != nil {
return nil, errors.Wrapf(err, "fetch %s", robotsURL)
}
defer resp.Body.Close()
// Parse the response.
switch resp.StatusCode {
case http.StatusOK:
data, err := robotstxt.FromResponse(resp)
if err != nil {
return nil, errors.Wrapf(err, "parse robots.txt from %s", robotsURL)
}
c.robotsDataCache.Store(host, data)
return data, nil
case http.StatusNotFound:
data := &robotstxt.RobotsData{}
c.robotsDataCache.Store(host, data)
return data, nil
case http.StatusUnauthorized, http.StatusForbidden:
return nil, errors.Errorf("access to %s denied (status %d)", robotsURL, resp.StatusCode)
default:
return nil, errors.Errorf("unexpected status %d fetching %s", resp.StatusCode, robotsURL)
}
}
type jina struct {
hc *http.Client
token string
}
func NewJina(token string) Crawler {
return &jina{
hc: &http.Client{},
// If token is empty, will not affect to use, but rate limit will be lower.
// See https://jina.ai/api-dashboard/rate-limit.
token: token,
}
}
func (c *jina) Markdown(ctx context.Context, u string) ([]byte, error) {
proxyURL := fmt.Sprintf("https://r.jina.ai/%s", u)
req, err := http.NewRequestWithContext(ctx, http.MethodGet, proxyURL, nil)
if err != nil {
return nil, errors.Wrapf(err, "create request for %s", u)
}
req.Header.Set("X-Engine", "browser")
req.Header.Set("X-Robots-Txt", userAgent)
if c.token != "" {
req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", c.token))
}
resp, err := c.hc.Do(req)
if err != nil {
return nil, errors.Wrapf(err, "fetch %s", proxyURL)
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
return nil, errors.Errorf("received non-200 status code %d from %s", resp.StatusCode, proxyURL)
}
mdBytes, err := io.ReadAll(resp.Body)
if err != nil {
return nil, errors.Wrapf(err, "read body from %s", proxyURL)
}
return mdBytes, nil
}

View File

@@ -13,39 +13,19 @@
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
package rpc
package jsonrpc
import (
"context"
"encoding/json"
"errors"
"net/http"
"github.com/glidea/zenfeed/pkg/api"
)
type Handler[Request any, Response any] func(ctx context.Context, req *Request) (*Response, error)
var (
ErrBadRequest = func(err error) Error { return newError(http.StatusBadRequest, err) }
ErrNotFound = func(err error) Error { return newError(http.StatusNotFound, err) }
ErrInternal = func(err error) Error { return newError(http.StatusInternalServerError, err) }
)
type Error struct {
Code int `json:"code"`
Message string `json:"message"`
}
func (e Error) Error() string {
return e.Message
}
func newError(code int, err error) Error {
return Error{
Code: code,
Message: err.Error(),
}
}
func API[Request any, Response any](handler Handler[Request, Response]) http.Handler {
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
allowCORS(w)
@@ -65,11 +45,11 @@ func API[Request any, Response any](handler Handler[Request, Response]) http.Han
resp, err := handler(r.Context(), &req)
if err != nil {
var rpcErr Error
if errors.As(err, &rpcErr) {
var apiErr api.Error
if errors.As(err, &apiErr) {
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(rpcErr.Code)
_ = json.NewEncoder(w).Encode(rpcErr)
w.WriteHeader(apiErr.Code)
_ = json.NewEncoder(w).Encode(apiErr)
return
}

View File

@@ -13,7 +13,7 @@
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
package rpc
package jsonrpc
import (
"bytes"
@@ -27,6 +27,7 @@ import (
. "github.com/onsi/gomega"
"github.com/glidea/zenfeed/pkg/api"
"github.com/glidea/zenfeed/pkg/test"
)
@@ -58,15 +59,15 @@ func TestAPI(t *testing.T) {
}
badRequestHandler := func(ctx context.Context, req *TestRequest) (*TestResponse, error) {
return nil, ErrBadRequest(errors.New("invalid request"))
return nil, api.ErrBadRequest(errors.New("invalid request"))
}
notFoundHandler := func(ctx context.Context, req *TestRequest) (*TestResponse, error) {
return nil, ErrNotFound(errors.New("resource not found"))
return nil, api.ErrNotFound(errors.New("resource not found"))
}
internalErrorHandler := func(ctx context.Context, req *TestRequest) (*TestResponse, error) {
return nil, ErrInternal(errors.New("server error"))
return nil, api.ErrInternal(errors.New("server error"))
}
genericErrorHandler := func(ctx context.Context, req *TestRequest) (*TestResponse, error) {