Compare commits
10 Commits
| SHA1 |
| :--- |
| `278cb662de` |
| `8f32e427d4` |
| `3049c49f7a` |
| `14a4f2b8d4` |
| `6a869574fc` |
| `c581cbacda` |
| `e7fe17a4bc` |
| `b35aaa3b68` |
| `be83967168` |
| `064bca1dda` |
README.md (19 changed lines)
@@ -1,8 +1,21 @@
[English](README-en.md)

zenfeed: empower RSS with AI. It automatically filters, summarizes, and pushes the important information for you, so you can say goodbye to information overload and take back control of your reading.

Ready-to-use public instance: https://zenfeed.xyz (integrating common public sources such as GitHub Trending and the V2EX hot list)

Three things:

**1. An AI-powered RSS reader**

**2. A real-time "news" knowledge base**

**3. A secretary that keeps constant watch on "specified events" for you (e.g. "tariff policy changes", "xx stock fluctuations")**

Ready-to-use public instance: https://zenfeed.xyz (integrating common public sources such as Hacker News, GitHub Trending, and the V2EX hot list)

> The summarization model has been updated to Gemini 2.5 Pro!!

The Doubao bot is going live!

Join the WeChat group below 👇🏻 to follow updates

## Preface
@@ -142,8 +155,6 @@ $env:API_KEY = "SiliconFlow API key"; docker-compose -p zenfeed up -d

* Webhook notification support
* Crawler

> Progress updates are posted first on [Linux Do](https://linux.do/u/ajd/summary)

## Questions or feedback? Join the group to discuss

<img src="docs/images/wechat.png" alt="Wechat" width="150">
@@ -44,7 +44,7 @@

| Field | Type | Description | Default | Required |
| :--- | :--- | :--- | :--- | :--- |
| `scrape.past` | `time.Duration` | The lookback time window for scraping feeds. e.g. `1h` means only scrape feeds from the past 1 hour. | `3d` | No |
| `scrape.past` | `time.Duration` | The lookback time window for scraping feeds. e.g. `1h` means only scrape feeds from the past 1 hour. | `24h` | No |
| `scrape.interval` | `time.Duration` | How often to scrape each source (global default). e.g. `1h`. | `1h` | No |
| `scrape.rsshub_endpoint` | `string` | The RSSHub endpoint. You can deploy your own RSSHub server or use a public instance (see the [RSSHub docs](https://docs.rsshub.app/guide/instances)). e.g. `https://rsshub.app`. | | Yes (if `rsshub_route_path` is used) |
| `scrape.sources` | `list of objects` | The list of sources for scraping feeds. See **Scrape Source Configuration** below. | `[]` | Yes (at least one) |
@@ -44,7 +44,7 @@ This section defines a list of available Large Language Models. At least one LLM

| Field | Type | Description | Default | Required |
| :--- | :--- | :--- | :--- | :--- |
| `scrape.past` | duration | The lookback time window for scraping feeds. e.g. `1h` means only scrape feeds in the past 1 hour. | `3d` | No |
| `scrape.past` | duration | The lookback time window for scraping feeds. e.g. `1h` means only scrape feeds in the past 1 hour. | `24h` | No |
| `scrape.interval` | duration | How often to scrape each source (global default). e.g. `1h`. | `1h` | No |
| `scrape.rsshub_endpoint` | string | The endpoint of the RSSHub. You can deploy your own or use a public one (see [RSSHub Docs](https://docs.rsshub.app/guide/instances)). e.g. `https://rsshub.app`. | | Yes (if `rsshub_route_path` used) |
| `scrape.sources` | list of objects | The sources for scraping feeds. See **Scrape Source Configuration** below. | `[]` | Yes (at least one) |
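Putting the table together, a minimal `scrape` block could look like the sketch below; the source name and route path are illustrative placeholders, not values from this diff.

```yaml
scrape:
  past: 24h            # lookback window (new default; previously 3d)
  interval: 1h         # global default scrape frequency
  rsshub_endpoint: https://rsshub.app  # required only when a source sets rsshub_route_path
  sources:
    - name: hacker-news                # illustrative source name
      rss:
        rsshub_route_path: hackernews  # illustrative RSSHub route
```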
BIN docs/images/crad.png (new file; binary file not shown; 617 KiB)
@@ -30,6 +30,7 @@ import (
	"github.com/glidea/zenfeed/pkg/telemetry"
	"github.com/glidea/zenfeed/pkg/telemetry/log"
	telemetrymodel "github.com/glidea/zenfeed/pkg/telemetry/model"
	timeutil "github.com/glidea/zenfeed/pkg/util/time"
)

// --- Interface code block ---
@@ -83,10 +84,10 @@ type LLM struct {
}

type Scrape struct {
	Past           time.Duration  `yaml:"past,omitempty" json:"past,omitempty" desc:"The lookback time window for scraping feeds. e.g. 1h means only scrape feeds in the past 1 hour. Default: 3d"`
	Interval       time.Duration  `yaml:"interval,omitempty" json:"interval,omitempty" desc:"How often to scrape each source, it is a global interval. e.g. 1h. Default: 1h"`
	RSSHubEndpoint string         `yaml:"rsshub_endpoint,omitempty" json:"rsshub_endpoint,omitempty" desc:"The endpoint of the RSSHub. You can deploy your own RSSHub server or use the public one (https://docs.rsshub.app/guide/instances). e.g. https://rsshub.app. It is required when sources[].rss.rsshub_route_path is set."`
	Sources        []ScrapeSource `yaml:"sources,omitempty" json:"sources,omitempty" desc:"The sources for scraping feeds."`
	Past           timeutil.Duration `yaml:"past,omitempty" json:"past,omitempty" desc:"The lookback time window for scraping feeds. e.g. 1h means only scrape feeds in the past 1 hour. Default: 3d"`
	Interval       timeutil.Duration `yaml:"interval,omitempty" json:"interval,omitempty" desc:"How often to scrape each source, it is a global interval. e.g. 1h. Default: 1h"`
	RSSHubEndpoint string            `yaml:"rsshub_endpoint,omitempty" json:"rsshub_endpoint,omitempty" desc:"The endpoint of the RSSHub. You can deploy your own RSSHub server or use the public one (https://docs.rsshub.app/guide/instances). e.g. https://rsshub.app. It is required when sources[].rss.rsshub_route_path is set."`
	Sources        []ScrapeSource    `yaml:"sources,omitempty" json:"sources,omitempty" desc:"The sources for scraping feeds."`
}

type Storage struct {
@@ -95,15 +96,15 @@ type Storage struct {
}

type FeedStorage struct {
	Rewrites      []RewriteRule `yaml:"rewrites,omitempty" json:"rewrites,omitempty" desc:"How to process each feed before storing it. It inspired by Prometheus relabeling (https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config), this implements a very strong flexibility and loose coupling."`
	FlushInterval time.Duration `yaml:"flush_interval,omitempty" json:"flush_interval,omitempty" desc:"How often to flush the feed storage to the database, higher value will cause high data loss risk, but on the other hand, it will reduce the number of disk operations and improve performance. Default: 200ms"`
	EmbeddingLLM  string        `yaml:"embedding_llm,omitempty" json:"embedding_llm,omitempty" desc:"The embedding LLM for the feed storage. It will significantly affect the accuracy of semantic search, please be careful to choose. If you want to switch, please note to keep the old llm configuration, because the past data is still implicitly associated with it, otherwise it will cause the past data to be unable to be semantically searched. Default is the default LLM in llms section."`
	Retention     time.Duration `yaml:"retention,omitempty" json:"retention,omitempty" desc:"How long to keep a feed. Default: 8d"`
	BlockDuration time.Duration `yaml:"block_duration,omitempty" json:"block_duration,omitempty" desc:"How long to keep the feed storage block. Block is time-based, like Prometheus TSDB Block. Default: 25h"`
	Rewrites      []RewriteRule     `yaml:"rewrites,omitempty" json:"rewrites,omitempty" desc:"How to process each feed before storing it. It inspired by Prometheus relabeling (https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config), this implements a very strong flexibility and loose coupling."`
	FlushInterval timeutil.Duration `yaml:"flush_interval,omitempty" json:"flush_interval,omitempty" desc:"How often to flush the feed storage to the database, higher value will cause high data loss risk, but on the other hand, it will reduce the number of disk operations and improve performance. Default: 200ms"`
	EmbeddingLLM  string            `yaml:"embedding_llm,omitempty" json:"embedding_llm,omitempty" desc:"The embedding LLM for the feed storage. It will significantly affect the accuracy of semantic search, please be careful to choose. If you want to switch, please note to keep the old llm configuration, because the past data is still implicitly associated with it, otherwise it will cause the past data to be unable to be semantically searched. Default is the default LLM in llms section."`
	Retention     timeutil.Duration `yaml:"retention,omitempty" json:"retention,omitempty" desc:"How long to keep a feed. Default: 8d"`
	BlockDuration timeutil.Duration `yaml:"block_duration,omitempty" json:"block_duration,omitempty" desc:"How long to keep the feed storage block. Block is time-based, like Prometheus TSDB Block. Default: 25h"`
}

type ScrapeSource struct {
	Interval time.Duration     `yaml:"interval,omitempty" json:"interval,omitempty" desc:"How often to scrape this source. Default: global interval"`
	Interval timeutil.Duration `yaml:"interval,omitempty" json:"interval,omitempty" desc:"How often to scrape this source. Default: global interval"`
	Name     string            `yaml:"name,omitempty" json:"name,omitempty" desc:"The name of the source. It is required."`
	Labels   map[string]string `yaml:"labels,omitempty" json:"labels,omitempty" desc:"The additional labels to add to the feed of this source."`
	RSS      *ScrapeSourceRSS  `yaml:"rss,omitempty" json:"rss,omitempty" desc:"The RSS config of the source."`
@@ -134,12 +135,12 @@ type RewriteRuleTransformToText struct {
}

type SchedulsRule struct {
	Name          string        `yaml:"name,omitempty" json:"name,omitempty" desc:"The name of the rule. It is required."`
	Query         string        `yaml:"query,omitempty" json:"query,omitempty" desc:"The semantic query to get the feeds. NOTE it is optional"`
	Threshold     float32       `yaml:"threshold,omitempty" json:"threshold,omitempty" desc:"The threshold to filter the query result by relevance (with 'query') score. It does not work when query is not set. Default is 0.6."`
	LabelFilters  []string      `yaml:"label_filters,omitempty" json:"label_filters,omitempty" desc:"The label filters (equal or not equal) to match the feeds. e.g. [category=tech, source!=github]"`
	EveryDay      string        `yaml:"every_day,omitempty" json:"every_day,omitempty" desc:"The query range at the end time of every day. Format: start~end, e.g. 00:00~23:59, or -22:00~7:00 (yesterday 22:00 to today 07:00)."`
	WatchInterval time.Duration `yaml:"watch_interval,omitempty" json:"watch_interval,omitempty" desc:"The run and query interval to watch the rule. Default is 10m. It can not be set with every_day at same time."`
	Name          string            `yaml:"name,omitempty" json:"name,omitempty" desc:"The name of the rule. It is required."`
	Query         string            `yaml:"query,omitempty" json:"query,omitempty" desc:"The semantic query to get the feeds. NOTE it is optional"`
	Threshold     float32           `yaml:"threshold,omitempty" json:"threshold,omitempty" desc:"The threshold to filter the query result by relevance (with 'query') score. It does not work when query is not set. Default is 0.6."`
	LabelFilters  []string          `yaml:"label_filters,omitempty" json:"label_filters,omitempty" desc:"The label filters (equal or not equal) to match the feeds. e.g. [category=tech, source!=github]"`
	EveryDay      string            `yaml:"every_day,omitempty" json:"every_day,omitempty" desc:"The query range at the end time of every day. Format: start~end, e.g. 00:00~23:59, or -22:00~7:00 (yesterday 22:00 to today 07:00)."`
	WatchInterval timeutil.Duration `yaml:"watch_interval,omitempty" json:"watch_interval,omitempty" desc:"The run and query interval to watch the rule. Default is 10m. It can not be set with every_day at same time."`
}

type NotifyRoute struct {

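For context, a rule built from these fields might be written as below. The surrounding `scheduls` key and every value are assumptions inferred from the struct name and the field descriptions, not confirmed by this diff.

```yaml
scheduls:
  - name: tariff-watch                # illustrative rule name
    query: "tariff policy changes"    # optional semantic query
    threshold: 0.6                    # default relevance cutoff
    label_filters: [category=tech]
    watch_interval: 10m               # cannot be combined with every_day
```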
@@ -275,7 +275,12 @@ func (r *router) Route(ctx context.Context, result *rule.Result) (groups []*Grou
	return groups, nil
}

func (r *router) generateSummary(ctx context.Context, prompt string, feeds []*Feed, sourceLabel string) (string, error) {
func (r *router) generateSummary(
	ctx context.Context,
	prompt string,
	feeds []*Feed,
	sourceLabel string,
) (string, error) {
	content := r.parseContentToSummary(feeds, sourceLabel)
	if content == "" {
		return "", nil
@@ -296,6 +301,7 @@ func (r *router) generateSummary(ctx context.Context, prompt string, feeds []*Fe
func (r *router) parseContentToSummary(feeds []*Feed, sourceLabel string) string {
	if sourceLabel == "" {
		b := runtimeutil.Must1(json.Marshal(feeds))

		return string(b)
	}

@@ -59,7 +59,7 @@ func (c *Config) From(app *config.App) *Config {
			Threshold:     r.Threshold,
			LabelFilters:  r.LabelFilters,
			EveryDay:      r.EveryDay,
			WatchInterval: r.WatchInterval,
			WatchInterval: time.Duration(r.WatchInterval),
		}
	}

@@ -66,14 +66,14 @@ func (c *Config) From(app *config.App) {
	c.Scrapers = make([]scraper.Config, len(app.Scrape.Sources))
	for i := range app.Scrape.Sources {
		c.Scrapers[i] = scraper.Config{
			Past:     app.Scrape.Past,
			Past:     time.Duration(app.Scrape.Past),
			Name:     app.Scrape.Sources[i].Name,
			Interval: app.Scrape.Sources[i].Interval,
			Interval: time.Duration(app.Scrape.Sources[i].Interval),
			Labels:   model.Labels{},
		}
		c.Scrapers[i].Labels.FromMap(app.Scrape.Sources[i].Labels)
		if c.Scrapers[i].Interval <= 0 {
			c.Scrapers[i].Interval = app.Scrape.Interval
			c.Scrapers[i].Interval = time.Duration(app.Scrape.Interval)
		}
		if app.Scrape.Sources[i].RSS != nil {
			c.Scrapers[i].RSS = &scraper.ScrapeSourceRSS{
@@ -55,7 +55,7 @@ const maxPast = 15 * 24 * time.Hour

func (c *Config) Validate() error {
	if c.Past <= 0 {
		c.Past = 3 * timeutil.Day
		c.Past = timeutil.Day
	}
	if c.Past > maxPast {
		c.Past = maxPast
@@ -208,10 +208,11 @@ func (s *scraper) fillIDs(feeds []*model.Feed) []*model.Feed {
	for _, feed := range feeds {
		// We can not use the pub time to join the hash,
		// because the pub time is dynamic for some sources.
		//
		// title may be changed for some sources... so...
		source := feed.Labels.Get(model.LabelSource)
		title := feed.Labels.Get(model.LabelTitle)
		link := feed.Labels.Get(model.LabelLink)
		feed.ID = hashutil.Sum64s([]string{source, title, link})
		feed.ID = hashutil.Sum64s([]string{source, link})
	}

	return feeds
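Dropping the title from the hash means a feed now keeps the same ID when only its title is edited, at the cost of colliding entries that share a source and link. A self-contained sketch of the idea, with `hash/fnv` standing in for the repo's `hashutil.Sum64s`:

```go
package main

import (
	"fmt"
	"hash/fnv"
)

// sum64s hashes a list of strings into one 64-bit ID,
// standing in for hashutil.Sum64s from the zenfeed codebase.
func sum64s(parts []string) uint64 {
	h := fnv.New64a()
	for _, p := range parts {
		h.Write([]byte(p))
		h.Write([]byte{0}) // separator so {"ab","c"} != {"a","bc"}
	}
	return h.Sum64()
}

func main() {
	source, link := "hackernews", "https://example.com/item/1"
	// Hashing only source+link keeps the ID stable when the title is edited.
	fmt.Println(sum64s([]string{source, link}))
}
```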
@@ -22,6 +22,7 @@ import (
	"reflect"
	"strconv"
	"sync"
	"sync/atomic"
	"time"

	"github.com/benbjohnson/clock"
@@ -98,9 +99,9 @@ func (c *Config) Validate() error {
func (c *Config) From(app *config.App) {
	*c = Config{
		Dir:           app.Storage.Dir,
		Retention:     app.Storage.Feed.Retention,
		BlockDuration: app.Storage.Feed.BlockDuration,
		FlushInterval: app.Storage.Feed.FlushInterval,
		Retention:     time.Duration(app.Storage.Feed.Retention),
		BlockDuration: time.Duration(app.Storage.Feed.BlockDuration),
		FlushInterval: time.Duration(app.Storage.Feed.FlushInterval),
		EmbeddingLLM:  app.Storage.Feed.EmbeddingLLM,
	}
}
@@ -578,10 +579,14 @@ func (s *storage) blockDependencies() block.Dependencies {
}

func (s *storage) rewrite(ctx context.Context, feeds []*model.Feed) ([]*model.Feed, error) {
	rewritten := make([]*model.Feed, 0, len(feeds))
	var wg sync.WaitGroup
	var errs []error
	var mu sync.Mutex
	var (
		rewritten = make([]*model.Feed, 0, len(feeds))
		wg        sync.WaitGroup
		mu        sync.Mutex
		errs      []error
		dropped   atomic.Int32
	)

	for _, item := range feeds { // TODO: Limit the concurrency & goroutine number.
		wg.Add(1)
		go func(item *model.Feed) {
@@ -596,6 +601,7 @@ func (s *storage) rewrite(ctx context.Context, feeds []*model.Feed) ([]*model.Fe
			}
			if len(labels) == 0 {
				log.Debug(ctx, "drop feed", "id", item.ID)
				dropped.Add(1)

				return // Drop empty labels.
			}
@@ -607,10 +613,12 @@ func (s *storage) rewrite(ctx context.Context, feeds []*model.Feed) ([]*model.Fe
		}(item)
	}
	wg.Wait()
	if allFailed := len(errs) == len(feeds); allFailed {
		return nil, errs[0]
	}
	if len(errs) > 0 {
	switch len(errs) {
	case 0:
	case len(feeds) - int(dropped.Load()):
		return nil, errs[0] // All failed.
	default:
		log.Error(ctx, errors.Wrap(errs[0], "rewrite feeds"), "error_count", len(errs))
	}

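The switch rework also changes what counts as total failure: dropped feeds are excluded, so the run aborts only when every surviving feed errored. A minimal sketch of the same accounting pattern, with illustrative names:

```go
package main

import (
	"errors"
	"fmt"
	"sync"
	"sync/atomic"
)

// processAll fans work out to goroutines and fails hard only when
// every item that was not deliberately dropped returned an error.
func processAll(items []int, work func(int) (keep bool, err error)) error {
	var (
		wg      sync.WaitGroup
		mu      sync.Mutex
		errs    []error
		dropped atomic.Int32
	)
	for _, it := range items {
		wg.Add(1)
		go func(it int) {
			defer wg.Done()
			keep, err := work(it)
			if err != nil {
				mu.Lock()
				errs = append(errs, err)
				mu.Unlock()
				return
			}
			if !keep {
				dropped.Add(1) // dropped items do not count as failures
			}
		}(it)
	}
	wg.Wait()

	switch len(errs) {
	case 0:
		return nil
	case len(items) - int(dropped.Load()):
		return errs[0] // everything that could fail did fail
	default:
		fmt.Printf("partial failure: %d errors\n", len(errs))
		return nil
	}
}

func main() {
	err := processAll([]int{1, 2, 3}, func(i int) (bool, error) {
		if i == 2 {
			return false, nil // dropped
		}
		return true, errors.New("boom")
	})
	fmt.Println(err) // both non-dropped items failed -> boom
}
```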
@@ -17,11 +17,13 @@ package time

import (
	"context"
	"encoding/json"
	"math/rand"
	"time"
	_ "time/tzdata"

	"github.com/pkg/errors"
	"gopkg.in/yaml.v3"

	runtimeutil "github.com/glidea/zenfeed/pkg/util/runtime"
)
@@ -84,3 +86,60 @@ func Tick(ctx context.Context, d time.Duration, f func() error) error {
func Random(max time.Duration) time.Duration {
	return time.Duration(rand.Int63n(int64(max)))
}

type Duration time.Duration

func (d Duration) String() string {
	return time.Duration(d).String()
}

func (d Duration) MarshalJSON() ([]byte, error) {
	return json.Marshal(d.String())
}

func (d *Duration) UnmarshalJSON(b []byte) error {
	var v any
	if err := json.Unmarshal(b, &v); err != nil {
		return err
	}

	switch tv := v.(type) {
	case float64:
		*d = Duration(time.Duration(tv))

		return nil

	case string:
		parsed, err := time.ParseDuration(tv)
		if err != nil {
			return err
		}
		*d = Duration(parsed)

		return nil

	default:
		return errors.Errorf("invalid duration: %v", tv)
	}
}

func (d Duration) MarshalYAML() (interface{}, error) {
	return d.String(), nil
}

func (d *Duration) UnmarshalYAML(value *yaml.Node) error {
	if value.Kind != yaml.ScalarNode {
		return errors.Errorf("invalid duration: expected a scalar node, got %v", value.Kind)
	}

	s := value.Value

	parsed, err := time.ParseDuration(s)
	if err != nil {
		return errors.Errorf("failed to parse duration string '%s' from YAML: %s", s, err.Error())
	}

	*d = Duration(parsed)

	return nil
}
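As a sanity check, here is a minimal sketch of how this Duration round-trips through YAML and JSON. The `cfg` struct is illustrative; only the `timeutil` import path is taken from the diff above.

```go
package main

import (
	"encoding/json"
	"fmt"
	"time"

	"gopkg.in/yaml.v3"

	timeutil "github.com/glidea/zenfeed/pkg/util/time"
)

// cfg is a stand-in for a config section using the new duration type.
type cfg struct {
	Past timeutil.Duration `yaml:"past" json:"past"`
}

func main() {
	var c cfg
	// YAML: parsed via time.ParseDuration, so "24h", "200ms", "1h30m" all work.
	if err := yaml.Unmarshal([]byte("past: 24h"), &c); err != nil {
		panic(err)
	}
	fmt.Println(time.Duration(c.Past)) // 24h0m0s

	// JSON: marshals back to the human-readable string form.
	b, _ := json.Marshal(c)
	fmt.Println(string(b)) // {"past":"24h0m0s"}
}
```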