10 Commits

| Author | SHA1 | Message | Date |
| :----- | :--------- | :----------------------------------- | :------------------------- |
| glidea | 278cb662de | marshal time.Duration as json string | 2025-05-06 16:00:59 +08:00 |
| glidea | 8f32e427d4 | update README | 2025-05-06 11:31:39 +08:00 |
| glidea | 3049c49f7a | fix dedup | 2025-05-06 11:27:36 +08:00 |
| glidea | 14a4f2b8d4 | fix rewrite error handling | 2025-05-03 14:51:27 +08:00 |
| glidea | 6a869574fc | update README | 2025-05-02 11:38:58 +08:00 |
| glidea | c581cbacda | fix rewrite error handling | 2025-05-01 19:19:34 +08:00 |
| glidea | e7fe17a4bc | update image | 2025-04-30 20:16:28 +08:00 |
| glidea | b35aaa3b68 | update image | 2025-04-30 20:13:25 +08:00 |
| glidea | be83967168 | update README | 2025-04-30 11:41:44 +08:00 |
| glidea | 064bca1dda | fix lint | 2025-04-29 08:22:03 +08:00 |
11 changed files with 127 additions and 41 deletions

View File

@@ -1,8 +1,21 @@
[English](README-en.md)
zenfeed empowers RSS with AI: it automatically filters, summarizes, and pushes the information that matters to you, so you can say goodbye to information overload and regain a sense of control over your reading.
![](docs/images/crad.png)
Out-of-the-box public instance: https://zenfeed.xyz (integrates common public sources such as GitHub Trending and the V2EX hot list)
In short, three things:
**1. An AI-powered RSS reader**
**2. A real-time "news" knowledge base**
**3. A secretary that keeps watch on "specified events" for you (e.g. "tariff policy changes", "xx stock fluctuations")**
Out-of-the-box public instance: https://zenfeed.xyz (integrates common public sources such as Hacker News, GitHub Trending, and the V2EX hot list)
> The summary model has been upgraded to Gemini 2.5 Pro!!
The Doubao bot is launching soon!
Join the WeChat group below 👇🏻 to follow updates
## Introduction
@@ -142,8 +155,6 @@ $env:API_KEY = "SiliconFlow API key"; docker-compose -p zenfeed up -d
* Webhook notification support
* Crawler
> Progress will be posted on [Linux Do](https://linux.do/u/ajd/summary) as soon as it happens
## For any questions or feedback, feel free to join the group to discuss
<img src="docs/images/wechat.png" alt="Wechat" width="150">

View File

@@ -44,7 +44,7 @@
| Field | Type | Description | Default | Required |
| :----------------------- | :-------------- | :----------------------------------------------------------------------------------------------------------------------------------------------------- | :----- | :---------------------------------- |
| `scrape.past` | `time.Duration` | The lookback time window for scraping feeds. e.g. `1h` means only scrape feeds from the past 1 hour. | `3d` | No |
| `scrape.past` | `time.Duration` | The lookback time window for scraping feeds. e.g. `1h` means only scrape feeds from the past 1 hour. | `24h` | No |
| `scrape.interval` | `time.Duration` | How often to scrape each source (global default). e.g. `1h`. | `1h` | No |
| `scrape.rsshub_endpoint` | `string` | The endpoint of RSSHub. You can deploy your own RSSHub server or use a public instance (see [RSSHub Docs](https://docs.rsshub.app/guide/instances)). e.g. `https://rsshub.app`. | | Yes (if `rsshub_route_path` is used) |
| `scrape.sources` | `list of objects` | The list of sources for scraping feeds. See **Scrape Source Configuration** below. | `[]` | Yes (at least one) |

View File

@@ -44,7 +44,7 @@ This section defines a list of available Large Language Models. At least one LLM
| Field | Type | Description | Default | Required |
| :----------------------- | :-------------- | :--------------------------------------------------------------------------------------------------------------------------------------------------------------- | :------ | :-------------------------------- |
| `scrape.past` | duration | The lookback time window for scraping feeds. e.g. `1h` means only scrape feeds in the past 1 hour. | `3d` | No |
| `scrape.past` | duration | The lookback time window for scraping feeds. e.g. `1h` means only scrape feeds in the past 1 hour. | `24h` | No |
| `scrape.interval` | duration | How often to scrape each source (global default). e.g. `1h`. | `1h` | No |
| `scrape.rsshub_endpoint` | string | The endpoint of the RSSHub. You can deploy your own or use a public one (see [RSSHub Docs](https://docs.rsshub.app/guide/instances)). e.g. `https://rsshub.app`. | | Yes (if `rsshub_route_path` used) |
| `scrape.sources` | list of objects | The sources for scraping feeds. See **Scrape Source Configuration** below. | `[]` | Yes (at least one) |
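For reference, a minimal `scrape` section exercising these fields might look like the sketch below; the source name and RSSHub route path are illustrative placeholders, not values from this change:

```yaml
scrape:
  past: 24h                     # lookback window (the new default)
  interval: 1h                  # global per-source default
  rsshub_endpoint: https://rsshub.app
  sources:
    - name: hacker-news         # illustrative source name
      rss:
        rsshub_route_path: hackernews   # hypothetical RSSHub route
```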

BIN docs/images/crad.png (new file, 617 KiB; binary file not shown)

View File

@@ -30,6 +30,7 @@ import (
"github.com/glidea/zenfeed/pkg/telemetry"
"github.com/glidea/zenfeed/pkg/telemetry/log"
telemetrymodel "github.com/glidea/zenfeed/pkg/telemetry/model"
timeutil "github.com/glidea/zenfeed/pkg/util/time"
)
// --- Interface code block ---
@@ -83,10 +84,10 @@ type LLM struct {
}
type Scrape struct {
Past time.Duration `yaml:"past,omitempty" json:"past,omitempty" desc:"The lookback time window for scraping feeds. e.g. 1h means only scrape feeds in the past 1 hour. Default: 3d"`
Interval time.Duration `yaml:"interval,omitempty" json:"interval,omitempty" desc:"How often to scrape each source, it is a global interval. e.g. 1h. Default: 1h"`
RSSHubEndpoint string `yaml:"rsshub_endpoint,omitempty" json:"rsshub_endpoint,omitempty" desc:"The endpoint of the RSSHub. You can deploy your own RSSHub server or use the public one (https://docs.rsshub.app/guide/instances). e.g. https://rsshub.app. It is required when sources[].rss.rsshub_route_path is set."`
Sources []ScrapeSource `yaml:"sources,omitempty" json:"sources,omitempty" desc:"The sources for scraping feeds."`
Past timeutil.Duration `yaml:"past,omitempty" json:"past,omitempty" desc:"The lookback time window for scraping feeds. e.g. 1h means only scrape feeds in the past 1 hour. Default: 3d"`
Interval timeutil.Duration `yaml:"interval,omitempty" json:"interval,omitempty" desc:"How often to scrape each source, it is a global interval. e.g. 1h. Default: 1h"`
RSSHubEndpoint string `yaml:"rsshub_endpoint,omitempty" json:"rsshub_endpoint,omitempty" desc:"The endpoint of the RSSHub. You can deploy your own RSSHub server or use the public one (https://docs.rsshub.app/guide/instances). e.g. https://rsshub.app. It is required when sources[].rss.rsshub_route_path is set."`
Sources []ScrapeSource `yaml:"sources,omitempty" json:"sources,omitempty" desc:"The sources for scraping feeds."`
}
type Storage struct {
@@ -95,15 +96,15 @@ type Storage struct {
}
type FeedStorage struct {
Rewrites []RewriteRule `yaml:"rewrites,omitempty" json:"rewrites,omitempty" desc:"How to process each feed before storing it. It inspired by Prometheus relabeling (https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config), this implements a very strong flexibility and loose coupling."`
FlushInterval time.Duration `yaml:"flush_interval,omitempty" json:"flush_interval,omitempty" desc:"How often to flush the feed storage to the database, higher value will cause high data loss risk, but on the other hand, it will reduce the number of disk operations and improve performance. Default: 200ms"`
EmbeddingLLM string `yaml:"embedding_llm,omitempty" json:"embedding_llm,omitempty" desc:"The embedding LLM for the feed storage. It will significantly affect the accuracy of semantic search, please be careful to choose. If you want to switch, please note to keep the old llm configuration, because the past data is still implicitly associated with it, otherwise it will cause the past data to be unable to be semantically searched. Default is the default LLM in llms section."`
Retention time.Duration `yaml:"retention,omitempty" json:"retention,omitempty" desc:"How long to keep a feed. Default: 8d"`
BlockDuration time.Duration `yaml:"block_duration,omitempty" json:"block_duration,omitempty" desc:"How long to keep the feed storage block. Block is time-based, like Prometheus TSDB Block. Default: 25h"`
Rewrites []RewriteRule `yaml:"rewrites,omitempty" json:"rewrites,omitempty" desc:"How to process each feed before storing it. It inspired by Prometheus relabeling (https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config), this implements a very strong flexibility and loose coupling."`
FlushInterval timeutil.Duration `yaml:"flush_interval,omitempty" json:"flush_interval,omitempty" desc:"How often to flush the feed storage to the database, higher value will cause high data loss risk, but on the other hand, it will reduce the number of disk operations and improve performance. Default: 200ms"`
EmbeddingLLM string `yaml:"embedding_llm,omitempty" json:"embedding_llm,omitempty" desc:"The embedding LLM for the feed storage. It will significantly affect the accuracy of semantic search, please be careful to choose. If you want to switch, please note to keep the old llm configuration, because the past data is still implicitly associated with it, otherwise it will cause the past data to be unable to be semantically searched. Default is the default LLM in llms section."`
Retention timeutil.Duration `yaml:"retention,omitempty" json:"retention,omitempty" desc:"How long to keep a feed. Default: 8d"`
BlockDuration timeutil.Duration `yaml:"block_duration,omitempty" json:"block_duration,omitempty" desc:"How long to keep the feed storage block. Block is time-based, like Prometheus TSDB Block. Default: 25h"`
}
type ScrapeSource struct {
Interval time.Duration `yaml:"interval,omitempty" json:"interval,omitempty" desc:"How often to scrape this source. Default: global interval"`
Interval timeutil.Duration `yaml:"interval,omitempty" json:"interval,omitempty" desc:"How often to scrape this source. Default: global interval"`
Name string `yaml:"name,omitempty" json:"name,omitempty" desc:"The name of the source. It is required."`
Labels map[string]string `yaml:"labels,omitempty" json:"labels,omitempty" desc:"The additional labels to add to the feed of this source."`
RSS *ScrapeSourceRSS `yaml:"rss,omitempty" json:"rss,omitempty" desc:"The RSS config of the source."`
@@ -134,12 +135,12 @@ type RewriteRuleTransformToText struct {
}
type SchedulsRule struct {
Name string `yaml:"name,omitempty" json:"name,omitempty" desc:"The name of the rule. It is required."`
Query string `yaml:"query,omitempty" json:"query,omitempty" desc:"The semantic query to get the feeds. NOTE it is optional"`
Threshold float32 `yaml:"threshold,omitempty" json:"threshold,omitempty" desc:"The threshold to filter the query result by relevance (with 'query') score. It does not work when query is not set. Default is 0.6."`
LabelFilters []string `yaml:"label_filters,omitempty" json:"label_filters,omitempty" desc:"The label filters (equal or not equal) to match the feeds. e.g. [category=tech, source!=github]"`
EveryDay string `yaml:"every_day,omitempty" json:"every_day,omitempty" desc:"The query range at the end time of every day. Format: start~end, e.g. 00:00~23:59, or -22:00~7:00 (yesterday 22:00 to today 07:00)."`
WatchInterval time.Duration `yaml:"watch_interval,omitempty" json:"watch_interval,omitempty" desc:"The run and query interval to watch the rule. Default is 10m. It can not be set with every_day at same time."`
Name string `yaml:"name,omitempty" json:"name,omitempty" desc:"The name of the rule. It is required."`
Query string `yaml:"query,omitempty" json:"query,omitempty" desc:"The semantic query to get the feeds. NOTE it is optional"`
Threshold float32 `yaml:"threshold,omitempty" json:"threshold,omitempty" desc:"The threshold to filter the query result by relevance (with 'query') score. It does not work when query is not set. Default is 0.6."`
LabelFilters []string `yaml:"label_filters,omitempty" json:"label_filters,omitempty" desc:"The label filters (equal or not equal) to match the feeds. e.g. [category=tech, source!=github]"`
EveryDay string `yaml:"every_day,omitempty" json:"every_day,omitempty" desc:"The query range at the end time of every day. Format: start~end, e.g. 00:00~23:59, or -22:00~7:00 (yesterday 22:00 to today 07:00)."`
WatchInterval timeutil.Duration `yaml:"watch_interval,omitempty" json:"watch_interval,omitempty" desc:"The run and query interval to watch the rule. Default is 10m. It can not be set with every_day at same time."`
}
type NotifyRoute struct {

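Why the swap from `time.Duration` to `timeutil.Duration` above is transparent to existing configs: the new type still parses `24h`-style strings from YAML (its `UnmarshalYAML` appears in the `pkg/util/time` diff further below). A minimal sketch, assuming the import path shown in this file's imports and using a pared-down stand-in for `Scrape`:

```go
package main

import (
	"fmt"
	"time"

	"gopkg.in/yaml.v3"

	timeutil "github.com/glidea/zenfeed/pkg/util/time"
)

// scrape is a pared-down stand-in for the Scrape config struct above.
type scrape struct {
	Past     timeutil.Duration `yaml:"past,omitempty"`
	Interval timeutil.Duration `yaml:"interval,omitempty"`
}

func main() {
	var s scrape
	if err := yaml.Unmarshal([]byte("past: 24h\ninterval: 1h"), &s); err != nil {
		panic(err)
	}
	// Convert back to time.Duration at the boundary, as the From() methods below do.
	fmt.Println(time.Duration(s.Past), time.Duration(s.Interval)) // 24h0m0s 1h0m0s
}
```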
View File

@@ -275,7 +275,12 @@ func (r *router) Route(ctx context.Context, result *rule.Result) (groups []*Grou
return groups, nil
}
func (r *router) generateSummary(ctx context.Context, prompt string, feeds []*Feed, sourceLabel string) (string, error) {
func (r *router) generateSummary(
ctx context.Context,
prompt string,
feeds []*Feed,
sourceLabel string,
) (string, error) {
content := r.parseContentToSummary(feeds, sourceLabel)
if content == "" {
return "", nil
@@ -296,6 +301,7 @@ func (r *router) generateSummary(ctx context.Context, prompt string, feeds []*Fe
func (r *router) parseContentToSummary(feeds []*Feed, sourceLabel string) string {
if sourceLabel == "" {
b := runtimeutil.Must1(json.Marshal(feeds))
return string(b)
}

View File

@@ -59,7 +59,7 @@ func (c *Config) From(app *config.App) *Config {
Threshold: r.Threshold,
LabelFilters: r.LabelFilters,
EveryDay: r.EveryDay,
WatchInterval: r.WatchInterval,
WatchInterval: time.Duration(r.WatchInterval),
}
}

View File

@@ -66,14 +66,14 @@ func (c *Config) From(app *config.App) {
c.Scrapers = make([]scraper.Config, len(app.Scrape.Sources))
for i := range app.Scrape.Sources {
c.Scrapers[i] = scraper.Config{
Past: app.Scrape.Past,
Past: time.Duration(app.Scrape.Past),
Name: app.Scrape.Sources[i].Name,
Interval: app.Scrape.Sources[i].Interval,
Interval: time.Duration(app.Scrape.Sources[i].Interval),
Labels: model.Labels{},
}
c.Scrapers[i].Labels.FromMap(app.Scrape.Sources[i].Labels)
if c.Scrapers[i].Interval <= 0 {
c.Scrapers[i].Interval = app.Scrape.Interval
c.Scrapers[i].Interval = time.Duration(app.Scrape.Interval)
}
if app.Scrape.Sources[i].RSS != nil {
c.Scrapers[i].RSS = &scraper.ScrapeSourceRSS{

View File

@@ -55,7 +55,7 @@ const maxPast = 15 * 24 * time.Hour
func (c *Config) Validate() error {
if c.Past <= 0 {
c.Past = 3 * timeutil.Day
c.Past = timeutil.Day
}
if c.Past > maxPast {
c.Past = maxPast
@@ -208,10 +208,11 @@ func (s *scraper) fillIDs(feeds []*model.Feed) []*model.Feed {
for _, feed := range feeds {
// We can not use the pub time to join the hash,
// because the pub time is dynamic for some sources.
//
// title may be changed for some sources... so...
source := feed.Labels.Get(model.LabelSource)
title := feed.Labels.Get(model.LabelTitle)
link := feed.Labels.Get(model.LabelLink)
feed.ID = hashutil.Sum64s([]string{source, title, link})
feed.ID = hashutil.Sum64s([]string{source, link})
}
return feeds

View File

@@ -22,6 +22,7 @@ import (
"reflect"
"strconv"
"sync"
"sync/atomic"
"time"
"github.com/benbjohnson/clock"
@@ -98,9 +99,9 @@ func (c *Config) Validate() error {
func (c *Config) From(app *config.App) {
*c = Config{
Dir: app.Storage.Dir,
Retention: app.Storage.Feed.Retention,
BlockDuration: app.Storage.Feed.BlockDuration,
FlushInterval: app.Storage.Feed.FlushInterval,
Retention: time.Duration(app.Storage.Feed.Retention),
BlockDuration: time.Duration(app.Storage.Feed.BlockDuration),
FlushInterval: time.Duration(app.Storage.Feed.FlushInterval),
EmbeddingLLM: app.Storage.Feed.EmbeddingLLM,
}
}
@@ -578,10 +579,14 @@ func (s *storage) blockDependencies() block.Dependencies {
}
func (s *storage) rewrite(ctx context.Context, feeds []*model.Feed) ([]*model.Feed, error) {
rewritten := make([]*model.Feed, 0, len(feeds))
var wg sync.WaitGroup
var errs []error
var mu sync.Mutex
var (
rewritten = make([]*model.Feed, 0, len(feeds))
wg sync.WaitGroup
mu sync.Mutex
errs []error
dropped atomic.Int32
)
for _, item := range feeds { // TODO: Limit the concurrency & goroutine number.
wg.Add(1)
go func(item *model.Feed) {
@@ -596,6 +601,7 @@ func (s *storage) rewrite(ctx context.Context, feeds []*model.Feed) ([]*model.Fe
}
if len(labels) == 0 {
log.Debug(ctx, "drop feed", "id", item.ID)
dropped.Add(1)
return // Drop empty labels.
}
@@ -607,10 +613,12 @@ func (s *storage) rewrite(ctx context.Context, feeds []*model.Feed) ([]*model.Fe
}(item)
}
wg.Wait()
if allFailed := len(errs) == len(feeds); allFailed {
return nil, errs[0]
}
if len(errs) > 0 {
switch len(errs) {
case 0:
case len(feeds) - int(dropped.Load()):
return nil, errs[0] // All failed.
default:
log.Error(ctx, errors.Wrap(errs[0], "rewrite feeds"), "error_count", len(errs))
}

View File

@@ -17,11 +17,13 @@ package time
import (
"context"
"encoding/json"
"math/rand"
"time"
_ "time/tzdata"
"github.com/pkg/errors"
"gopkg.in/yaml.v3"
runtimeutil "github.com/glidea/zenfeed/pkg/util/runtime"
)
@@ -84,3 +86,60 @@ func Tick(ctx context.Context, d time.Duration, f func() error) error {
func Random(max time.Duration) time.Duration {
return time.Duration(rand.Int63n(int64(max)))
}
type Duration time.Duration
func (d Duration) String() string {
return time.Duration(d).String()
}
func (d Duration) MarshalJSON() ([]byte, error) {
return json.Marshal(d.String())
}
func (d *Duration) UnmarshalJSON(b []byte) error {
var v any
if err := json.Unmarshal(b, &v); err != nil {
return err
}
switch tv := v.(type) {
case float64:
*d = Duration(time.Duration(tv))
return nil
case string:
parsed, err := time.ParseDuration(tv)
if err != nil {
return err
}
*d = Duration(parsed)
return nil
default:
return errors.Errorf("invalid duration: %v", tv)
}
}
func (d Duration) MarshalYAML() (interface{}, error) {
return d.String(), nil
}
func (d *Duration) UnmarshalYAML(value *yaml.Node) error {
if value.Kind != yaml.ScalarNode {
return errors.Errorf("invalid duration: expected a scalar node, got %v", value.Kind)
}
s := value.Value
parsed, err := time.ParseDuration(s)
if err != nil {
return errors.Errorf("failed to parse duration string '%s' from YAML: %s", s, err.Error())
}
*d = Duration(parsed)
return nil
}
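With these methods in place, a `Duration` marshals to JSON as a human-readable string rather than an integer nanosecond count, while unmarshaling accepts both forms. A quick usage sketch, assuming the package is importable at the `pkg/util/time` path shown in the earlier imports:

```go
package main

import (
	"encoding/json"
	"fmt"
	"time"

	timeutil "github.com/glidea/zenfeed/pkg/util/time"
)

func main() {
	// Marshals as a string, matching the commit "marshal time.Duration as json string".
	b, _ := json.Marshal(timeutil.Duration(90 * time.Minute))
	fmt.Println(string(b)) // "1h30m0s", not 5400000000000

	// Unmarshals from the new string form...
	var d timeutil.Duration
	_ = json.Unmarshal([]byte(`"24h"`), &d)
	fmt.Println(time.Duration(d)) // 24h0m0s

	// ...and still accepts the legacy integer-nanosecond form (the float64 case).
	_ = json.Unmarshal([]byte(`5400000000000`), &d)
	fmt.Println(time.Duration(d)) // 1h30m0s
}
```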