88 Commits
v0.1.0 ... main

Author SHA1 Message Date
glidea
094600ee26 update README
Removed sponsorship section and updated images with links.
2025-11-22 14:59:24 +08:00
glidea
c03e4c8359 Merge pull request #31 from Twelveeee/main
feat:add RSSHub RSSHubAccessKey
2025-11-07 15:59:20 +08:00
Twelveeee
584f94e1ef feat:add RSSHub RSSHubAccessKey 2025-11-07 14:27:29 +08:00
Twelveeee
6c4223de92 feat:add RSSHub RSSHubAccessKey 2025-11-06 15:58:11 +08:00
Twelveeee
f67db8ea86 feat:add RSSHub RSSHubAccessKey 2025-11-06 11:06:26 +08:00
Twelveeee
bc54cc852e feat:add RSSHub RSSHubAccessKey 2025-11-05 14:55:01 +00:00
glidea
7cb8069d60 update README.md 2025-09-08 15:56:13 +08:00
glidea
87b84d94ff update README.md 2025-09-06 16:20:32 +08:00
glidea
4d29bae67f update README 2025-08-18 16:41:23 +08:00
glidea
d640e975bd handle empty response for gemini 2025-08-18 16:33:27 +08:00
glidea
e4bd0ca43b recommend Qwen/Qwen3-Embedding-4B by default 2025-07-24 10:14:09 +08:00
glidea
8b001c4cdf update image 2025-07-16 11:40:43 +08:00
glidea
6cacb47d3d update doc 2025-07-15 11:31:25 +08:00
glidea
a65d597032 update doc 2025-07-14 21:46:20 +08:00
glidea
151bd5f66f update sponsor 2025-07-14 21:32:43 +08:00
glidea
69a9545869 update doc 2025-07-14 18:12:17 +08:00
glidea
b01e07e348 fix doc 2025-07-14 12:28:52 +08:00
glidea
e92d7e322e allow empty config for object storage 2025-07-11 21:42:54 +08:00
glidea
7b4396067b fix ci 2025-07-09 21:47:23 +08:00
glidea
00c5dfadee add podcast 2025-07-09 17:28:26 +08:00
glidea
263fcbbfaf update docs 2025-07-02 10:51:45 +08:00
glidea
9783ef693f update rewrite-zh.md 2025-06-26 10:52:35 +08:00
glidea
2df7c120a6 fix docs 2025-06-24 08:39:39 +08:00
glidea
4ac4667ce9 fix typo 2025-06-11 21:37:24 +08:00
glidea
94ac06d9ac update docs 2025-06-10 21:48:05 +08:00
glidea
90148b2fcd update docs 2025-06-10 17:03:43 +08:00
glidea
0fc6d73b04 update docs 2025-06-09 20:32:33 +08:00
glidea
55a5a186b9 add desc of telemetry.address 2025-06-09 17:58:51 +08:00
glidea
53d4d40f9f update docs 2025-06-07 16:23:14 +08:00
glidea
c320beaff2 optimize prompt 2025-06-07 16:17:36 +08:00
glidea
a7e283a6f9 update docs 2025-06-07 16:17:36 +08:00
glidea
0a17fc1172 update docs 2025-06-07 16:17:36 +08:00
glidea
ae90606263 support attach labels to rule result 2025-06-07 16:17:36 +08:00
glidea
f4d3786496 add summary_html_snippet_for_small_model 2025-06-07 16:17:36 +08:00
glidea
8241a57d88 update README.md 2025-06-07 10:54:18 +08:00
glidea
20f2af6571 fix typo 2025-06-06 09:09:46 +08:00
glidea
a273f8ee4d update docs 2025-06-06 00:03:43 +08:00
glidea
ef2c44438c fix lint 2025-06-05 23:46:54 +08:00
glidea
d520444e9f add rss & crawl & webhook 2025-06-05 23:29:37 +08:00
glidea
ead8286a48 optimize docker compose yml 2025-06-05 23:27:05 +08:00
glidea
7bec054369 update gitignore 2025-06-05 23:25:20 +08:00
glidea
7ad9789a76 update docs 2025-06-05 22:34:30 +08:00
glidea
8ae4c678e9 update README.md 2025-06-01 12:46:18 +08:00
glidea
e9e6a112cc update docs 2025-05-19 20:59:40 +08:00
glidea
24f09e31a7 update docs 2025-05-19 20:58:40 +08:00
glidea
267fe0cd52 update docs 2025-05-19 20:56:11 +08:00
glidea
12f1559ac3 update docs 2025-05-17 23:03:59 +08:00
glidea
34dda291c8 update README 2025-05-17 20:26:45 +08:00
glidea
4922998bac add tech docs 2025-05-17 20:25:10 +08:00
glidea
a59a24dfce update README 2025-05-17 20:25:00 +08:00
glidea
91d99a6eea update README 2025-05-10 16:57:21 +08:00
glidea
ec27a94647 add query api doc 2025-05-08 16:32:12 +08:00
glidea
98228d7434 fix test 2025-05-06 16:07:22 +08:00
glidea
278cb662de marshal time.Duration as json string 2025-05-06 16:00:59 +08:00
glidea
8f32e427d4 update README 2025-05-06 11:31:39 +08:00
glidea
3049c49f7a fix dedup 2025-05-06 11:27:36 +08:00
glidea
14a4f2b8d4 fix rewrite error handing 2025-05-03 14:51:27 +08:00
glidea
6a869574fc update README 2025-05-02 11:38:58 +08:00
glidea
c581cbacda fix rewrite error handing 2025-05-01 19:19:34 +08:00
glidea
e7fe17a4bc update image 2025-04-30 20:16:28 +08:00
glidea
b35aaa3b68 update image 2025-04-30 20:13:25 +08:00
glidea
be83967168 update README 2025-04-30 11:41:44 +08:00
glidea
064bca1dda fix lint 2025-04-29 08:22:03 +08:00
glidea
ab05089ec6 update README 2025-04-28 23:32:42 +08:00
glidea
b15c52a8c7 update README 2025-04-28 23:30:19 +08:00
glidea
18cc247532 add summary for notification & misc fix 2025-04-28 23:29:34 +08:00
glidea
98837b7d6d optimize docker compose 2025-04-28 23:26:09 +08:00
glidea
dca095f41c update README 2025-04-26 13:06:49 +08:00
glidea
92bde40ef0 update README 2025-04-26 12:35:02 +08:00
glidea
fea0bfa88d update README 2025-04-25 11:20:39 +08:00
glidea
9f9044b078 update README 2025-04-25 11:02:26 +08:00
glidea
6ee9517b31 update README 2025-04-24 21:22:26 +08:00
glidea
b6f81a3ad6 remove legacy tests 2025-04-24 18:53:13 +08:00
glidea
eb788dc738 update README 2025-04-24 13:08:18 +08:00
glidea
9b5aee1ed7 update README 2025-04-24 08:57:10 +08:00
glidea
185cb2fba5 add English doc 2025-04-23 20:58:46 +08:00
glidea
ddf284be0a update README 2025-04-23 20:19:26 +08:00
glidea
57ea6e681e correction for commit 170703ce80: v0.1.1. OMG I need to sleep!! 2025-04-23 19:55:56 +08:00
glidea
170703ce80 correction for commit 9026a08298: v0.1.0 2025-04-23 19:48:57 +08:00
glidea
396ee45e8c fix docker image label 2025-04-23 19:47:41 +08:00
glidea
9026a08298 v0.1.0 2025-04-23 19:15:31 +08:00
glidea
57112b916b adapting to RSSHub's incompatible changes 2025-04-23 11:43:36 +08:00
glidea
2896dfa108 lock rsshub version 2025-04-23 11:42:44 +08:00
glidea
b76fc32c9a add issue-translator 2025-04-23 10:34:49 +08:00
glidea
668f6da981 update README 2025-04-22 15:24:55 +08:00
glidea
b45fd547ab optimize deploy 2025-04-22 14:59:41 +08:00
glidea
afc15b1ed6 fix install cmd for Linux 2025-04-21 22:35:09 +08:00
glidea
0f3b282c12 remove used render.sh 2025-04-20 12:08:58 +08:00
84 changed files with 5040 additions and 1704 deletions

1
.github/FUNDING.yml vendored Normal file
View File

@@ -0,0 +1 @@
custom: https://afdian.com/a/glidea

View File

@@ -2,7 +2,7 @@ name: CI
on:
push:
branches: [ main ]
branches: [ main, dev ]
pull_request:
branches: [ main ]
release:
@@ -27,7 +27,7 @@ jobs:
build-and-push:
runs-on: ubuntu-latest
needs: test
if: github.event_name == 'release'
if: github.event_name == 'release' || (github.event_name == 'push' && github.ref_name == 'dev')
steps:
- uses: actions/checkout@v4
- name: Set up Docker Buildx
@@ -37,5 +37,9 @@ jobs:
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}
- name: Build and push Docker images
run: make push
- name: Build and push Docker image (main)
if: github.event_name == 'release'
run: make push
- name: Build and push Docker image (dev)
if: github.ref_name == 'dev'
run: make dev-push

14
.github/workflows/issue-translator.yml vendored Normal file
View File

@@ -0,0 +1,14 @@
name: 'issue-translator'
on:
issue_comment:
types: [created]
issues:
types: [opened]
jobs:
build:
runs-on: ubuntu-latest
steps:
- uses: usthe/issues-translate-action@v2.7
with:
IS_MODIFY_TITLE: true

3
.gitignore vendored
View File

@@ -18,7 +18,8 @@ local_docs/
.env
.env.local
__debug_bin
config.yaml
config.*yaml
data/
*debug*
.cursorrules
.cursor/

View File

@@ -28,7 +28,6 @@ linters:
- importas
- inamedparam
- intrange
- lll
- maintidx
- nestif
- nlreturn

View File

@@ -10,10 +10,14 @@ RUN GOOS=linux go build -ldflags="-s -w -X main.version=${VERSION}" -o /app/zenf
FROM alpine:latest
ARG VERSION=dev
LABEL org.opencontainers.image.version=${VERSION}
RUN apk add --no-cache ca-certificates tzdata && \
mkdir -p /app/data
COPY --from=builder /app/zenfeed /app/
WORKDIR /app
ENTRYPOINT ["/app/zenfeed"]
CMD ["--config", "/app/config/config.yaml"]

View File

@@ -4,7 +4,7 @@ REGISTRY ?= glidea
FULL_IMAGE_NAME = $(REGISTRY)/$(IMAGE_NAME)
.PHONY: test push build-installer
.PHONY: test push dev-push
test:
go test -race -v -coverprofile=coverage.out -coverpkg=./... ./...
@@ -16,3 +16,10 @@ push:
-t $(FULL_IMAGE_NAME):$(VERSION) \
-t $(FULL_IMAGE_NAME):latest \
--push .
dev-push:
docker buildx create --use --name multi-platform-builder || true
docker buildx build --platform linux/amd64,linux/arm64 \
--build-arg VERSION=$(VERSION) \
-t $(FULL_IMAGE_NAME):$(VERSION) \
--push .

260
README-en.md Normal file
View File

@@ -0,0 +1,260 @@
[中文](README.md)
<p align="center">
<img src="docs/images/crad.png" alt="zenfeed cover image">
</p>
<p align="center">
<a href="https://app.codacy.com/gh/glidea/zenfeed/dashboard?utm_source=gh&utm_medium=referral&utm_content=&utm_campaign=Badge_grade"><img src="https://app.codacy.com/project/badge/Grade/1b51f1087558402d85496fbe7bddde89"/></a>
<a href="https://sonarcloud.io/summary/new_code?id=glidea_zenfeed"><img src="https://sonarcloud.io/api/project_badges/measure?project=glidea_zenfeed&metric=sqale_rating"/></a>
<a href="https://goreportcard.com/badge/github.com/glidea/zenfeed"><img src="https://goreportcard.com/badge/github.com/glidea/zenfeed"/></a>
<a href="https://deepwiki.com/glidea/zenfeed"><img src="https://deepwiki.com/badge.svg"/></a>
</p>
<h3 align="center">In the torrent of information (Feed), may you maintain your Zen.</h3>
<p align="center">
zenfeed is your <strong>AI information hub</strong>. It's an intelligent RSS reader, a real-time "news" knowledge base, and a personal secretary that helps you monitor "specific events" and delivers analysis reports.
</p>
<p align="center">
<a href="https://zenfeed.xyz"><b>Live Demo (RSS Reading Only)</b></a>
&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;
<a href="docs/tech/hld-en.md"><b>Technical Documentation</b></a>
&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;
<a href="#-installation-and-usage"><b>Quick Start</b></a>
</p>
> [!NOTE]
> The description on DeepWiki is not entirely accurate (and I cannot correct it), but the Q&A quality is decent.
---
**epub2rss**: Convert epub ebooks into RSS feeds that update with a chapter every day, [join waitlist](https://epub2rss.pages.dev/)
---
## 💡 Introduction
RSS (Really Simple Syndication) was born in the Web 1.0 era to solve the problem of information fragmentation, allowing users to aggregate and track updates from multiple websites in one place without frequent visits. It pushes website updates to subscribers in summary form for quick information retrieval.
However, with the rise of Web 2.0, social media, and algorithmic recommendations, RSS never became mainstream. The shutdown of Google Reader in 2013 was a landmark event. As Zhang Yiming (founder of ByteDance) pointed out at the time, RSS demands a lot from its users: strong information filtering skills and the self-discipline to manage subscription sources; otherwise, it's easy to get drowned in information noise. He believed that for most users, easier "personalized recommendations" were a better solution, which led to the creation of Toutiao and Douyin (TikTok).
Algorithmic recommendations have indeed lowered the barrier to accessing information, but their tendency to over-cater to human weaknesses often leads to filter bubbles and entertainment addiction. If you want to get truly valuable content from your information stream, you need even greater self-control to resist the algorithm's "feed."
So, is pure RSS subscription the answer? Not entirely. Information overload and the difficulty of filtering (information noise) are still major pain points for RSS users.
Confucius spoke of the "Doctrine of the Mean" in all things. Can we find a middle ground that allows us to enjoy the sense of control and high-quality sources from active RSS subscriptions while using technology to overcome the drawback of information overload?
Give zenfeed a try! **AI + RSS** might be a better way to consume information in this era. zenfeed aims to leverage the power of AI to help you automatically filter and summarize the information you care about, allowing you to maintain your Zen in the torrent of information (Feed).
> Reference Article: [AI Revives RSS? - sspai.com (Chinese)](https://sspai.com/post/89494)
---
## ✨ Features
![Zenfeed Architecture](docs/images/arch.png)
**For [RSS](https://en.wikipedia.org/wiki/RSS) Power Users** 🚗
* Your AI-powered RSS reader (use with [zenfeed-web](https://github.com/glidea/zenfeed-web))
* Can act as an [MCP](https://mcp.so/) Server for [RSSHub](https://github.com/DIYgod/RSSHub)
* Customize trusted RSS sources to build a lightning-fast personal AI search engine
* Similar in functionality to [Feedly AI](https://feedly.com/ai)
<details>
<summary><b>Preview</b></summary>
<br>
<img src="docs/images/feed-list-with-web.png" alt="Feed list" width="600">
<img src="docs/images/chat-with-feeds.png" alt="Chat with feeds" width="500">
</details>
**For Those Seeking an [Everything Tracker](https://www.wwzzai.com/) Alternative** 🔍
* Possesses powerful [information tracking capabilities](https://github.com/glidea/zenfeed/blob/main/docs/config.md#schedule-configuration-schedules) and emphasizes high-quality, customizable data sources.
* Can serve as an RSS version of [AI Chief Intelligence Officer](https://github.com/TeamWiseFlow/wiseflow?tab=readme-ov-file), but more flexible and closer to an engine.
<details>
<summary><b>Preview</b></summary>
<br>
<img src="docs/images/monitoring.png" alt="Monitoring setup" width="500">
<img src="docs/images/notification-with-web.png" alt="Notification example" width="500">
</details>
**For Those with Information Anxiety (like me)** 😌
* If you're tired of endlessly scrolling through feeds, try the briefing feature. Receive AI-powered briefings at a scheduled time each day for a comprehensive and efficient overview, eliminating the hidden costs of context switching. A bit of a renaissance feel, don't you think? ✨
* "zenfeed" is a combination of "zen" and "feed," meaning: in the torrent of information (feed), may you maintain your zen.
<details>
<summary><b>Preview</b></summary>
<br>
<img src="docs/images/daily-brief.png" alt="Daily brief example" width="500">
</details>
**For Developers** 🔬
* **Pipelined Processing**: Similar to Prometheus's [Relabeling](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config), zenfeed abstracts each piece of content into a set of labels. At each stage of the pipeline, you can use custom prompts to process these labels (e.g., scoring, classifying, summarizing, filtering).
* **Flexible Orchestration**: Based on the processed labels, you can freely query, filter, [route](https://github.com/glidea/zenfeed/blob/main/docs/config.md#notification-routing-configuration-notifyroute-and-notifyroutesub_routes), and [send notifications](https://github.com/glidea/zenfeed/blob/main/docs/config.md#notification-channel-email-configuration-notifychannelsemail), giving zenfeed a highly tool-oriented and customizable nature. For details, see [Rewrite Rules](docs/tech/rewrite-en.md).
* **Open APIs**:
* [Query API](/docs/query-api-en.md)
* [RSS Exported API](/docs/rss-api-en.md)
* [Notify Webhook](/docs/webhook-en.md)
* [Extensive Declarative YAML Configuration](/docs/config.md)
<details>
<summary><b>Preview</b></summary>
<br>
<img src="docs/images/update-config-with-web.png" alt="Update config via web" width="500">
</details>
<p align="center">
<a href="docs/preview.md"><b>➡️ See More Previews</b></a>
</p>
---
## 🚀 Installation and Usage
### 1. Prerequisites
> [!IMPORTANT]
> zenfeed uses model services from [SiliconFlow](https://cloud.siliconflow.cn/en) by default.
> * Models: `Qwen/Qwen3-8B` (Free) and `Qwen/Qwen3-Embedding-4B`.
> * If you don't have a SiliconFlow account yet, use this [**invitation link**](https://cloud.siliconflow.cn/i/U2VS0Q5A) to get a **¥14** credit.
> * If you need to use other providers or models, or for more detailed custom deployments, please refer to the [Configuration Documentation](https://github.com/glidea/zenfeed/blob/main/docs/config.md) to edit `docker-compose.yml`.
### 2. One-Click Deployment
> Get the service up and running in as little as one minute.
#### Mac / Linux
```bash
# Download the configuration file
curl -L -O https://raw.githubusercontent.com/glidea/zenfeed/main/docker-compose.yml
# Start the service (replace with your API_KEY)
API_KEY="sk-..." docker-compose -p zenfeed up -d
```
#### Windows (PowerShell)
```powershell
# Download the configuration file
Invoke-WebRequest -Uri "https://raw.githubusercontent.com/glidea/zenfeed/main/docker-compose.yml" -OutFile "docker-compose.yml"
# Start the service (replace with your API_KEY)
$env:API_KEY = "sk-..."; docker-compose -p zenfeed up -d
```
🎉 **Deployment Complete!**
Access it at http://localhost:1400
> [!WARNING]
> * If you deploy zenfeed on a public server like a VPS, access it via `http://<YOUR_IP>:1400` and ensure that your firewall/security group allows traffic on port `1400`.
> * **Security Notice:** zenfeed does not yet have an authentication mechanism. Exposing the service to the public internet may leak your `API_KEY`. Be sure to configure strict security group rules to allow access only from trusted IPs.
### 3. Getting Started
#### Add RSS Subscription Feeds
<img src="docs/images/web-add-source.png" alt="Add RSS source via web" width="400">
> * To migrate from Follow, please refer to [migrate-from-follow-en.md](docs/migrate-from-follow-en.md).
> * After adding a source, zenfeed needs to fetch content from the origin site, so make sure your network can reach it.
> * Please wait a few minutes after adding for content to be fetched and processed, especially if the model has strict rate limits.
#### Configure Daily Briefings, Monitoring, etc.
<img src="docs/images/notification-with-web.png" alt="Configure notifications via web" width="400">
#### Configure MCP (Optional)
For example, to configure MCP and connect to Zenfeed with Cherry Studio, see [Cherry Studio MCP](docs/cherry-studio-mcp-en.md).
> Default address `http://localhost:1301/sse`
#### More...
The web UI doesn't fully capture zenfeed's powerful flexibility. For more ways to play, please check the [Configuration Documentation](docs/config.md)
---
## 🗺️ Roadmap
We have some cool features planned. Check out our [Roadmap](/docs/roadmap-en.md) and feel free to share your suggestions!
---
## 💬 Community and Support
> **For usage questions, please prioritize opening an [Issue](https://github.com/glidea/zenfeed/issues).** This helps others with similar problems and allows for better tracking and resolution.
<table>
<tr>
<td align="center">
<img src="docs/images/wechat.png" alt="Wechat QR Code" width="150">
<br>
<strong>Join WeChat Group</strong>
</td>
<td align="center">
<img src="docs/images/sponsor.png" alt="Sponsor QR Code" width="150">
<br>
<strong>Buy Me a Coffee 🧋</strong>
</td>
</tr>
</table>
Since you've read this far, how about giving us a **Star ⭐️**? It's the biggest motivation for me to keep maintaining this project!
If you have any interesting AI job opportunities, please contact me!
---
## 🧩 Ecosystem Projects
### [Ruhang365 Daily](https://daily.ruhang365.com)
Founded in 2017, Ruhang365 aims to build a community for sharing expertise and growing together, starting with industry information exchange. It is dedicated to providing comprehensive career consulting, training, niche community interactions, and resource collaboration services for internet professionals.
*Experimental Content Sources (Updates Paused)*
* [V2EX](https://v2ex.analysis.zenfeed.xyz/)
* [LinuxDO](https://linuxdo.analysis.zenfeed.xyz/)
---
## 📝 Notes and Disclaimer
### Notes
* **Version Compatibility:** Backward compatibility for APIs and configurations is not guaranteed before version 1.0.
* **Open Source License:** The project uses the AGPLv3 license. Any forks or distributions must also remain open source.
* **Commercial Use:** Please contact the author to register for commercial use. Support can be provided within reasonable limits. We welcome legitimate commercial applications but discourage using this project for illicit activities.
* **Data Storage:** Data is not stored permanently; the default retention period is 8 days.
### Acknowledgements
* Thanks to [eryajf](https://github.com/eryajf) for the [Compose Inline Config](https://github.com/glidea/zenfeed/issues/1) suggestion, which makes deployment easier to understand.
* [![Powered by DartNode](https://dartnode.com/branding/DN-Open-Source-sm.png)](https://dartnode.com "Powered by DartNode - Free VPS for Open Source")
### Contributing
* The contribution guidelines are still a work in progress, but we adhere to one core principle: "Code Style Consistency."
### Disclaimer
<details>
<summary><strong>Click to expand for the full disclaimer</strong></summary>
**Before using the `zenfeed` software (hereinafter "the Software"), please read and understand this disclaimer carefully. By downloading, installing, using the Software or any related services, you acknowledge that you have read, understood, and agree to be bound by all the terms of this disclaimer. If you do not agree with any part of this disclaimer, please cease using the Software immediately.**
1. **"AS IS" BASIS:** The Software is provided on an "as is" and "as available" basis, without any warranties of any kind, either express or implied. The project authors and contributors make no representations or warranties regarding the Software's merchantability, fitness for a particular purpose, non-infringement, accuracy, completeness, reliability, security, timeliness, or performance.
2. **USER RESPONSIBILITY:** You are solely responsible for all your activities conducted through the Software. This includes, but is not limited to:
* **Data Source Selection:** You are responsible for selecting and configuring the data sources (e.g., RSS feeds, future potential Email sources) to be connected. You must ensure that you have the right to access and process the content from these sources and comply with their respective terms of service, copyright policies, and applicable laws and regulations.
* **Content Compliance:** You must not use the Software to process, store, or distribute any illegal, infringing, defamatory, obscene, or otherwise objectionable content.
* **API Key and Credential Security:** You are responsible for safeguarding any API keys, passwords, or other credentials you configure within the Software. The project authors and contributors are not liable for any loss or damage arising from your failure to do so.
* **Configuration and Use:** You are responsible for the correct configuration and use of the Software's features, including content processing pipelines, filtering rules, notification settings, etc.
3. **THIRD-PARTY CONTENT AND SERVICES:** The Software may integrate with or rely on third-party data sources and services (e.g., RSSHub, LLM providers, SMTP services). The project authors and contributors are not responsible for the availability, accuracy, legality, security, or terms of service of such third-party content or services. Your interactions with these third parties are governed by their respective terms and policies. The copyright of third-party content accessed or processed through the Software (including original articles, summaries, classifications, scores, etc.) belongs to the original rights holders. You are solely responsible for any legal liabilities that may arise from your use of such content.
4. **NO GUARANTEE OF PROCESSING ACCURACY:** The Software uses technologies like Large Language Models (LLMs) to process content (e.g., for summaries, classifications, scoring, filtering). These results may be inaccurate, incomplete, or biased. The project authors and contributors are not responsible for any decisions or actions taken based on these processing results. The accuracy of semantic search results is also affected by multiple factors and is not guaranteed.
5. **LIMITATION OF LIABILITY:** In no event shall the project authors or contributors be liable for any direct, indirect, incidental, special, exemplary, or consequential damages (including, but not limited to, procurement of substitute goods or services; loss of use, data, or profits; or business interruption) however caused and on any theory of liability, whether in contract, strict liability, or tort (including negligence or otherwise) arising in any way out of the use of this software, even if advised of the possibility of such damage.
6. **OPEN SOURCE SOFTWARE:** The Software is licensed under the AGPLv3 license. You are responsible for understanding and complying with the terms of this license.
7. **NOT LEGAL ADVICE:** This disclaimer does not constitute legal advice. If you have any questions about the legal implications of using the Software, you should consult with a qualified legal professional.
8. **MODIFICATION AND ACCEPTANCE:** The project authors reserve the right to modify this disclaimer at any time. Your continued use of the Software will be deemed acceptance of the modified terms.
**Please be aware: Crawling, processing, and distributing copyrighted content using the Software may carry legal risks. Users are responsible for ensuring that their use complies with all applicable laws, regulations, and third-party terms of service. The project authors and contributors assume no liability for any legal disputes or losses arising from the user's misuse or improper use of the Software.**
</details>

336
README.md
View File

@@ -1,183 +1,259 @@
## 项目介绍
[English](README-en.md)
[![Go Report Card](https://goreportcard.com/badge/github.com/glidea/zenfeed)](https://goreportcard.com/report/github.com/glidea/zenfeed)
---
# 合作伙伴
[![image](docs/images/302.jpg)](https://share.302.ai/mFS9MS)
zenfeed 是你的智能信息助手。它自动收集、筛选并总结关注的新闻或话题,然后发送给你。但我们可不是又造了一个 "今日头条"... 🤔
[302.AI](https://share.302.ai/mFS9MS)是一个按需付费的AI应用平台提供市面上最全的AI API和AI在线应用。
* 面向用户我们提供了50多种AI应用涵盖文字、图片和音视频各个领域无需月费按用量付费在线使用。
* 面向开发者一站式接入几乎所有AI应用开发需要用到的模型和API一站式付费统一接入。
* 面向企业管理与使用界面分离一人管理多人使用降低中小企业使用AI的门槛和成本。
![Zenfeed](docs/images/arch.png)
GitHub 一键登录 [注册一个](https://share.302.ai/mFS9MS) 试试吧!立即获得 1 美元额度
**For [RSS](https://zh.wikipedia.org/wiki/RSS) 老司机** 🚗
* zenfeed 可以是你的 AI 版 RSS 阅读器(配合 [zenfeed-web](https://github.com/glidea/zenfeed-web)
* [RSSHub](https://github.com/DIYgod/RSSHub) 的 [MCP](https://mcp.so/) Server
* 可自定义可信 RSS 数据源,且速度超快的 AI 搜索引擎
* 与 [Feedly AI](https://feedly.com/ai) 类似
---
# 正文
<p align="center">
<img src="docs/images/crad.png" alt="zenfeed cover image">
</p>
<p align="center">
<a href="https://app.codacy.com/gh/glidea/zenfeed/dashboard?utm_source=gh&utm_medium=referral&utm_content=&utm_campaign=Badge_grade"><img src="https://app.codacy.com/project/badge/Grade/1b51f1087558402d85496fbe7bddde89"/></a>
<a href="https://sonarcloud.io/summary/new_code?id=glidea_zenfeed"><img src="https://sonarcloud.io/api/project_badges/measure?project=glidea_zenfeed&metric=sqale_rating"/></a>
<a href="https://goreportcard.com/badge/github.com/glidea/zenfeed"><img src="https://goreportcard.com/badge/github.com/glidea/zenfeed"/></a>
<a href="https://deepwiki.com/glidea/zenfeed"><img src="https://deepwiki.com/badge.svg"/></a>
</p>
<h3 align="center">在信息洪流Feed愿你保持禅定Zen</h3>
<p align="center">
zenfeed 是你的 <strong>AI 信息中枢</strong>。它既是<strong>智能 RSS 阅读器</strong>,也是实时<strong>"新闻"知识库</strong>,更能成为帮你时刻关注"指定事件",并呈递<strong>分析报告</strong>的私人秘书。
</p>
<p align="center">
<a href="https://zenfeed.xyz"><b>在线体验 (仅 RSS 阅读)</b></a>
&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;
<a href="https://github.com/xusonfan/zenfeedApp"><b>安卓版体验 (仅 RSS 阅读)</b></a>
&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;
<a href="docs/tech/hld-zh.md"><b>技术文档</b></a>
&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;
<a href="#-安装与使用"><b>快速开始</b></a>
</p>
> [!NOTE]
> DeepWiki 的描述并不完全准确
---
**epub2rss**: 把 epub 电子书转成每日更新一个章节的 RSS Feed[join waitlist](https://epub2rss.pages.dev/)
**one-coffee**: 一款类似 syft万物追踪的日报产品差异点支持播客等多模态高质量信源主攻 AI 领域)。下方加我微信加入 waitlist
---
## 💡 前言
RSS简易信息聚合诞生于 Web 1.0 时代,旨在解决信息分散的问题,让用户能在一个地方聚合、追踪多个网站的更新,无需频繁访问。它将网站更新以摘要形式推送给订阅者,便于快速获取信息。
然而,随着 Web 2.0 的发展和社交媒体、算法推荐的兴起RSS 并未成为主流。Google Reader 在 2013 年的关闭便是一个标志性事件。正如张一鸣在当时指出的RSS 对用户要求较高:需要较强的信息筛选能力和自律性来管理订阅源,否则很容易被信息噪音淹没。他认为,对于大多数用户而言,更轻松的"个性化推荐"是更优解,这也催生了后来的今日头条和抖音。
算法推荐确实降低了信息获取的门槛,但其过度迎合人性弱点,往往导致信息茧房和娱乐化沉溺。如果你希望从信息流中获取真正有价值的内容,反而需要更强的自制力去对抗算法的"投喂"。
那么,纯粹的 RSS 订阅是否就是答案?也不尽然。信息过载和筛选困难(信息噪音)依然是 RSS 用户面临的痛点。
孔子说凡事讲究中庸之道。我们能否找到一种折中的办法,既能享受 RSS 主动订阅带来的掌控感和高质量信源,又能借助技术手段克服其信息过载的弊端?
试试 zenfeed 吧!**AI + RSS**或许是这个时代更优的信息获取方式。zenfeed 旨在利用 AI 的能力,帮你自动筛选、总结你所关注的信息,让你在信息洪流Feed中保持禅定Zen。
> 参考文章:[AI 复兴 RSS - 少数派](https://sspai.com/post/89494)
---
## ✨ 特性
![Zenfeed Architecture](docs/images/arch.png)
**专为 [RSS](https://zh.wikipedia.org/wiki/RSS) 老司机** 🚗
* 你的 AI 版 RSS 阅读器(配合 [zenfeed-web](https://github.com/glidea/zenfeed-web) 使用)
* 可作为 [RSSHub](https://github.com/DIYgod/RSSHub) 的 [MCP](https://mcp.so/) Server
* 可自定义可信 RSS 数据源,打造速度超快的个人 AI 搜索引擎
* 功能与 [Feedly AI](https://feedly.com/ai) 类似
<details>
<summary>预览</summary>
<img src="docs/images/feed-list-with-web.png" alt="" width="600">
<summary><b>预览</b></summary>
<br>
<img src="docs/images/feed-list-with-web.png" alt="Feed list" width="600">
<img src="docs/images/chat-with-feeds.png" alt="Chat with feeds" width="500">
</details>
**For [万物追踪](https://www.wwzzai.com/) 替代品寻觅者** 🔍
* zenfeed 同样拥有 [信息追踪能力](https://github.com/glidea/zenfeed/blob/main/docs/config-zh.md#%E8%B0%83%E5%BA%A6%E9%85%8D%E7%BD%AE-scheduls),且更强调高质量,自定义的数据源
* [AI 首席情报官](https://github.com/TeamWiseFlow/wiseflow?tab=readme-ov-file) 的 RSS 版,灵活版,更接近 PaaS 形态
**专为 [万物追踪](https://www.wwzzai.com/) 替代品寻觅者** 🔍
* 拥有强大的[信息追踪能力](https://github.com/glidea/zenfeed/blob/main/docs/config-zh.md#%E8%B0%83%E5%BA%A6%E9%85%8D%E7%BD%AE-scheduls),并更强调高质量、可自定义的数据源
* 可作为 [AI 首席情报官](https://github.com/TeamWiseFlow/wiseflow?tab=readme-ov-file) 的 RSS 版,更灵活,更接近引擎形态
<details>
<summary>预览</summary>
<img src="docs/images/monitoring.png" alt="" width="500">
<img src="docs/images/notification-with-web.png" alt="" width="500">
<summary><b>预览</b></summary>
<br>
<img src="docs/images/monitoring.png" alt="Monitoring setup" width="500">
<img src="docs/images/notification-with-web.png" alt="Notification example" width="500">
</details>
**For 信息焦虑症患者(比如我)** 😌
* "zenfeed" 是 "zen" 和 "feed" 的组合,意为在 feed信息洪流愿你保持 zen禅定
* 如果你对时不时地刷信息流感到焦虑疲惫,这是因为上下文切换的成本比想象得高,同时也妨碍了你进入心流。推荐你试试简报功能,每天固定时间收到对应时间段的简报邮件,从而一次性地,快速地,总览地完成阅读。啊哈有点文艺复兴的意味是吗 ✨
**专为 信息焦虑症患者 (比如我)** 😌
* 如果你对频繁刷信息流感到疲惫,试试简报功能。每日定时收取指定时段的 AI 简报,一次性、总览式地高效阅读,告别上下文切换的隐性成本。啊哈有点文艺复兴的意味是吗 ✨
* "zenfeed" 是 "zen" 和 "feed" 的组合,意为在 feed信息洪流愿你保持 zen禅定
<details>
<summary>预览</summary>
<img src="docs/images/daily-brief.png" alt="" width="500">
<summary><b>预览</b></summary>
<br>
<img src="docs/images/daily-brief.png" alt="Daily brief example" width="500">
</details>
**For AI 内容处理的探索者** 🔬
* zenfeed 有一种对内容进行管道化处理的自定义机制,类似 Prometheus [Relabeling](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config)
* 每篇内容都被抽象成一个标签集合(比如标题,来源,正文... 都是标签),在管道的每一个节点,可以基于自定义 Prompt 对特定标签值进行处理(比如评分、分类、摘要、过滤、添加新标签等...),而后基于标签查询过滤,[路由](https://github.com/glidea/zenfeed/blob/main/docs/config-zh.md#%E9%80%9A%E7%9F%A5%E8%B7%AF%E7%94%B1%E9%85%8D%E7%BD%AE-notifyroute-%E5%8F%8A-notifyroutesub_routes)[展示](https://github.com/glidea/zenfeed/blob/main/docs/config-zh.md#%E9%80%9A%E7%9F%A5%E6%B8%A0%E9%81%93-email-%E9%85%8D%E7%BD%AE-notifychannelsemail)... See [Rewrite Rules](https://github.com/glidea/zenfeed/blob/main/docs/config-zh.md#%E9%87%8D%E5%86%99%E8%A7%84%E5%88%99%E9%85%8D%E7%BD%AE-storagefeedrewrites)
* 重要的是你可以灵活地编排这一切,这赋予了 zenfeed 浓重的工具化、个性化色彩。欢迎通过 Push API 集成私有数据,探索更多的可能性
**专为 开发者** 🔬
* **管道化处理机制**: 类似 Prometheus 的 [Relabeling](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config)zenfeed 将每篇内容抽象为标签集,你可以在管道的每个节点,通过自定义 Prompt 对标签进行处理(评分、分类、摘要、过滤等)。
* **灵活编排**: 基于处理后的标签,你可以自由地进行查询、过滤、[路由](https://github.com/glidea/zenfeed/blob/main/docs/config-zh.md#%E9%80%9A%E7%9F%A5%E8%B7%AF%E7%94%B1%E9%85%8D%E7%BD%AE-notifyroute-%E5%8F%8A-notifyroutesub_routes)和[通知](https://github.com/glidea/zenfeed/blob/main/docs/config-zh.md#%E9%80%9A%E7%9F%A5%E6%B8%A0%E9%81%93-email-%E9%85%8D%E7%BD%AE-notifychannelsemail),赋予了 zenfeed 浓厚的工具化、个性化色彩。详情请见 [Rewrite Rules](docs/tech/rewrite-zh.md)
* **开放的 API**:
* [Query API](/docs/query-api-zh.md)
* [RSS Exported API](/docs/rss-api-zh.md)
* [Notify Webhook](/docs/webhook-zh.md)
* [大量声明式 YAML 配置](/docs/config-zh.md)
<details>
<summary>预览</summary>
<img src="docs/images/update-config-with-web.png" alt="" width="500">
<summary><b>预览</b></summary>
<br>
<img src="docs/images/update-config-with-web.png" alt="Update config via web" width="500">
</details>
<p align="center">
<a href="docs/preview.md"><b>➡️ 查看更多效果预览</b></a>
</p>
**For 吃瓜群众** 🍉
---
就冲这精美的邮件样式,请立即安装使用
## 🚀 安装使用
<img src="docs/images/monitoring.png" alt="" width="400">
### 1. 准备工作
[更多效果预览](docs/preview.md)
> [!IMPORTANT]
> zenfeed 默认使用 [硅基流动](https://cloud.siliconflow.cn/) 提供的模型服务。
> * 模型: `Qwen/Qwen3-8B` (免费) 和 `Qwen/Qwen3-Embedding-4B`。
> * **!!!如果你愿意赞助本项目,将获赠一定额度的 Gemini 2.5 Pro/Flash!!! (见下方)**
> * 如果你还没有硅基账号,使用 [**邀请链接**](https://cloud.siliconflow.cn/i/U2VS0Q5A) 可获得 **14 元** 赠送额度。
> * 如果需要使用其他厂商或模型,或进行更详细的自定义部署,请参考 [配置文档](https://github.com/glidea/zenfeed/blob/main/docs/config-zh.md) 来编辑 `docker-compose.yml`。
## 安装与使用
### 2. 一键部署
### 1. 安装
> 最快 1 分钟拉起服务。
替换下方 APIKey 等参数,并完整复制到终端一键执行。注意:
1. `provider` 除了硅基还支持 openai, openrouter, deepseek, gemini, volc火山(keng)引擎)。也可自定义,参考 [配置文档](docs/config-zh.md)。需要自定义其它参数的大佬也可参考
2. `llms[0].model` 默认会用来总结内容,相对耗费 Token一般 Qwen/Qwen2.5-7B-Instruct免费足够当然米够的话越强越好。如果你还没有硅基账号使用 [邀请链接](https://cloud.siliconflow.cn/i/U2VS0Q5A) 得 14 元额度
#### Mac/Linux
#### Mac / Linux
```bash
docker run --rm \
-v "$(PWD):/app" \
-w /app \
--entrypoint sh \
mikefarah/yq -c '
set -e
mkdir -p zenfeed/config && cd zenfeed
TEMPLATE_URL="https://raw.githubusercontent.com/glidea/zenfeed/main/install/config-template.yaml"
COMPOSE_URL="https://raw.githubusercontent.com/glidea/zenfeed/main/install/docker-compose.yml"
CONFIG_OUTPUT="config/config.yaml"
COMPOSE_OUTPUT="docker-compose.yml"
# 下载配置文件
curl -L -O https://raw.githubusercontent.com/glidea/zenfeed/main/docker-compose.yml
wget -qO- "$TEMPLATE_URL" | yq \
".timezone = \"Asia/Shanghai\" |
.llms[0].provider = \"siliconflow\" |
.llms[0].model = \"Qwen/Qwen2.5-7B-Instruct\" |
.llms[0].api_key = \"your_api_key\" | # <<<--- 替换 API Key! 其它参数按需选择
.llms[1].provider = \"siliconflow\" |
.llms[1].embedding_model = \"Pro/BAAI/bge-m3\" |
.llms[1].api_key = \"your_api_key\" | # <<<--- 替换 API Key!
.storage.feed.rewrites[0].transform.to_text.prompt = \"{{.summary_html_snippet}}使用中文回复\"" \
> "$CONFIG_OUTPUT"
wget -qO "$COMPOSE_OUTPUT" "$COMPOSE_URL"
' && cd zenfeed && docker compose up -d --wait
# 启动服务 (请替换你的 API_KEY)
API_KEY="sk-..." docker-compose -p zenfeed up -d
```
#### Windows
> 使用 PowerShell 执行
#### Windows (PowerShell)
```powershell
docker run --rm `
-v "${PWD}:/app" `
-w /app `
--entrypoint sh `
mikefarah/yq -c '
set -e
mkdir -p zenfeed/config && cd zenfeed
TEMPLATE_URL="https://raw.githubusercontent.com/glidea/zenfeed/main/install/config-template.yaml"
COMPOSE_URL="https://raw.githubusercontent.com/glidea/zenfeed/main/install/docker-compose.yml"
CONFIG_OUTPUT="config/config.yaml"
COMPOSE_OUTPUT="docker-compose.yml"
# 下载配置文件
Invoke-WebRequest -Uri "https://raw.githubusercontent.com/glidea/zenfeed/main/docker-compose.yml" -OutFile "docker-compose.yml"
wget -qO- "$TEMPLATE_URL" | yq \
".timezone = \"Asia/Shanghai\" |
.llms[0].provider = \"siliconflow\" |
.llms[0].model = \"Qwen/Qwen2.5-7B-Instruct\" |
.llms[0].api_key = \"your_api_key\" | # <<<--- 替换 API Key! 其它参数按需选择
.llms[1].provider = \"siliconflow\" |
.llms[1].embedding_model = \"Pro/BAAI/bge-m3\" |
.llms[1].api_key = \"your_api_key\" | # <<<--- 替换 API Key!
.storage.feed.rewrites[0].transform.to_text.prompt = \"{{.summary_html_snippet}}使用中文回复\"" \
> "$CONFIG_OUTPUT"
wget -qO "$COMPOSE_OUTPUT" "$COMPOSE_URL"
' ; cd zenfeed; docker compose up -d --wait
# 启动服务 (请替换你的 API_KEY)
$env:API_KEY = "sk-..."; docker-compose -p zenfeed up -d
```
### 2. 使用 Web 端
🎉 **部署完成!**
访问 http://localhost:1400
访问 https://zenfeed-web.pages.dev
> [!WARNING]
> * 如果将 zenfeed 部署在 VPS 等公网环境,请通过 `http://<你的IP>:1400` 访问,并确保防火墙/安全组已放行 `1400` 端口。
> * **安全提示:** zenfeed 尚无认证机制,将服务暴露到公网可能会泄露您的 `API_KEY`。请务必配置严格的安全组规则,仅对信任的 IP 开放访问。
> 会默认连接本地的 zenfeed
### 3. 开始使用
> 安卓版https://github.com/xusonfan/zenfeedApp
#### 添加 RSS 订阅源
<img src="docs/images/web-add-source.png" alt="" width="400">
<img src="docs/images/web-add-source.png" alt="Add RSS source via web" width="400">
> 从 Follow 迁移过来,参考 [migrate-from-follow.md](docs/migrate-from-follow.md)
> * 从 Follow 迁移,参考 [migrate-from-follow.md](docs/migrate-from-follow.md)
> * 添加后 zenfeed 需要访问源站,请保证网络畅通。
> * 添加后请稍等几分钟,等待内容抓取和处理,尤其是在模型有严格速率限制的情况下。
#### 配置每日简报监控等
#### 配置每日简报监控等
<img src="docs/images/notification-with-web.png" alt="" width="400">
<img src="docs/images/notification-with-web.png" alt="Configure notifications via web" width="400">
### 3. 配置 MCP可选
以 Cherry Studio 为例,配置 MCP 并连接到 Zenfeed见 [Cherry Studio MCP](docs/cherry-studio-mcp.md)
> 默认地址 http://localhost:1301/sse
#### 配置 MCP可选
以 Cherry Studio 为例,配置 MCP 并连接到 Zenfeed见 [Cherry Studio MCP](docs/cherry-studio-mcp.md)
> 默认地址 `http://localhost:1301/sse`
## Roadmap
* P0大概率会做
* 支持生成播客,男女对话,类似 NotebookLM
* 更多数据源
* 邮件
* 网页剪藏 Chrome 插件
* P1可能
* 关键词搜索
* 支持搜索引擎作为数据源
* APP
* 以下是由于版权风险,暂时不推进
* 支持 Webhook 通知
* 爬虫
#### More...
页面暂时没法表达 zenfeed 强大的灵活性,更多玩法请查阅[配置文档](docs/config-zh.md)
> 进展会第一时间在 [Linux Do](https://linux.do/u/ajd/summary) 更新
---
## 有任何问题与反馈,欢迎加群讨论
## 🗺️ Roadmap
<img src="docs/images/wechat.png" alt="Wechat" width="150">
我们规划了一些很 cool 的功能,欢迎查看 [Roadmap](/docs/roadmap-zh.md) 并提出你的建议!
都看到这里了,顺手点个 Star ⭐️ 呗,用于防止我太监掉
---
## 注意
* 1.0 版本之前不保证兼容性
* 项目采用 AGPL3 协议,任何 Fork 都需要开源
* 商用请联系报备,可提供合理范围内的支持。注意是合法商用哦,不欢迎搞灰色
* 数据不会永久保存,默认只存储 8 天
## 💬 交流与支持
## 免责声明 (Disclaimer)
> **使用问题请优先提 [Issue](https://github.com/glidea/zenfeed/issues)**,这能帮助到有类似问题的朋友,也能更好地追踪和解决问题。
**在使用 `zenfeed` 软件(以下简称“本软件”)前,请仔细阅读并理解本免责声明。您的下载、安装、使用本软件或任何相关服务的行为,即表示您已阅读、理解并同意接受本声明的所有条款。如果您不同意本声明的任何内容,请立即停止使用本软件。**
<table>
<tr>
<td align="center">
<img src="https://github.com/glidea/zenfeed/blob/main/docs/images/wechat.png?raw=true" alt="Wechat QR Code" width="300">
<br>
<strong>AI 学习交流社群</strong>
</td>
<td align="center">
<img src="https://github.com/glidea/banana-prompt-quicker/blob/main/images/glidea.png?raw=true" width="250">
<br>
<strong><a href="https://glidea.zenfeed.xyz/">我的其它项目</a></strong>
</td>
</tr>
<tr>
<td align="center" colspan="2">
<img src="https://github.com/glidea/banana-prompt-quicker/blob/main/images/readnote.png?raw=true" width="400">
<br>
<strong><a href="https://www.xiaohongshu.com/user/profile/5f7dc54d0000000001004afb">📕 小红书账号 - 持续分享 AI 原创</a></strong>
</td>
</tr>
</table>
1. **“按原样”提供:** 本软件按“现状”和“可用”的基础提供,不附带任何形式的明示或默示担保。项目作者和贡献者不对本软件的适销性、特定用途适用性、非侵权性、准确性、完整性、可靠性、安全性、及时性或性能做出任何保证或陈述。
都看到这里了,顺手点个 **Star ⭐️** 呗,这是我持续维护的最大动力!
有好玩的 AI 工作也请联系我!
---
## 📝 注意事项与免责声明
### 注意事项
* **版本兼容性:** 1.0 版本之前不保证 API 和配置的向后兼容性。
* **开源协议:** 项目采用 AGPLv3 协议,任何 Fork 和分发都必须保持开源。
* **商业使用:** 商用请联系作者报备,可在合理范围内提供支持。我们欢迎合法的商业用途,不欢迎利用本项目从事灰色产业。
* **数据存储:** 数据不会永久保存,默认只存储 8 天。
### 鸣谢
* 感谢 [eryajf](https://github.com/eryajf) 提供的 [Compose Inline Config](https://github.com/glidea/zenfeed/issues/1) 建议,让部署更易理解。
* [![Powered by DartNode](https://dartnode.com/branding/DN-Open-Source-sm.png)](https://dartnode.com "Powered by DartNode - Free VPS for Open Source")
### 欢迎贡献
* 目前贡献规范尚在完善,但我们坚守一个核心原则:"代码风格一致性"。
### 免责声明 (Disclaimer)
<details>
<summary><strong>点击展开查看完整免责声明</strong></summary>
**在使用 `zenfeed` 软件(以下简称"本软件")前,请仔细阅读并理解本免责声明。您的下载、安装、使用本软件或任何相关服务的行为,即表示您已阅读、理解并同意接受本声明的所有条款。如果您不同意本声明的任何内容,请立即停止使用本软件。**
1. **"按原样"提供:** 本软件按"现状"和"可用"的基础提供,不附带任何形式的明示或默示担保。项目作者和贡献者不对本软件的适销性、特定用途适用性、非侵权性、准确性、完整性、可靠性、安全性、及时性或性能做出任何保证或陈述。
2. **用户责任:** 您将对使用本软件的所有行为承担全部责任。这包括但不限于:
* **数据源选择:** 您自行负责选择并配置要接入的数据源(如 RSS feeds、未来可能的 Email 源等)。您必须确信您有权访问和处理这些数据源的内容,并遵守其各自的服务条款、版权政策及相关法律法规。
@@ -199,3 +275,5 @@ docker run --rm `
**请再次注意:使用本软件抓取、处理和分发受版权保护的内容可能存在法律风险。用户有责任确保其使用行为符合所有适用的法律法规和第三方服务条款。对于任何因用户滥用或不当使用本软件而引起的法律纠纷或损失,项目作者和贡献者不承担任何责任。**
</details>

80
docker-compose.yml Normal file
View File

@@ -0,0 +1,80 @@
services:
zenfeed-web:
image: glidea/zenfeed-web:latest
ports:
- "1400:1400"
environment:
- PUBLIC_DEFAULT_API_URL=http://zenfeed:1300
depends_on:
- zenfeed
restart: unless-stopped
zenfeed:
image: glidea/zenfeed:latest
entrypoint: >
sh -c "
if [ ! -f /app/config/config.yaml ]; then
echo 'Config file not found in volume, initializing from init config...'
cp /app/config.init.yaml /app/config/config.yaml;
else
echo 'Existing config file found in volume.'
fi &&
echo 'Starting Zenfeed...' &&
exec /app/zenfeed --config /app/config/config.yaml
"
configs:
- source: zenfeed_init_config
target: /app/config.init.yaml
volumes:
- data:/app/data
- config:/app/config
ports:
- "1300:1300"
- "1301:1301"
- "9090:9090"
depends_on:
- rsshub
restart: unless-stopped
rsshub:
image: diygod/rsshub:2024-12-14
ports:
- "1200:1200"
environment:
- NODE_ENV=production
restart: unless-stopped
volumes:
data: {}
config: {}
configs:
zenfeed_init_config: # After installation, you must modify the configuration through zenfeed or config volume.
content: |
timezone: ${TZ:-Asia/Shanghai}
llms:
- name: general
default: true
provider: siliconflow
model: Qwen/Qwen3-8B
api_key: ${API_KEY:-your-api-key}
- name: embed
provider: siliconflow
embedding_model: Qwen/Qwen3-Embedding-4B
api_key: ${API_KEY:-your-api-key}
scrape:
rsshub_endpoint: http://rsshub:1200
storage:
feed:
rewrites:
- transform:
to_text:
prompt: |
{{ .summary_html_snippet_for_small_model }} Respond in ${LANGUAGE:-Chinese}
label: summary_html_snippet
embedding_llm: embed
notify:
channels:
email:
feed_html_snippet_template: |
{{ .summary_html_snippet }}

View File

@@ -1,17 +1,17 @@
**配置 MCP Server**
**Configure MCP Server**
默认 URL: `http://localhost:1301/sse`
Default URL: `http://localhost:1301/sse`
<img src="images/cherry-studio-mcp.png" alt="Cherry Studio MCP" width="500">
**配置 Prompt可选但不使用效果可能不符合预期**
**Configure Prompt (Optional but recommended for optimal results)**
完整 Prompt 见 [mcp-client-prompt.md](mcp-client-prompt.md)
For complete prompt, see [mcp-client-prompt.md](mcp-client-prompt.md)
<img src="images/cherry-studio-mcp-prompt.png" alt="Cherry Studio MCP Prompt" width="500">
**玩法参考**
**Usage Examples**
[Doc](preview.md)
非常强大,还可以直接修改 zenfeed 配置项
Very powerful - you can even directly modify zenfeed configuration settings

View File

@@ -1,19 +1,22 @@
| 字段 | 类型 | 描述 | 默认值 | 是否必需 |
| :--------- | :------- | :------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :------------- | :------------- |
| `timezone` | `string` | 应用的时区。例如 `Asia/Shanghai`。 | 服务器本地时区 | 否 |
| `log` | `object` | 日志配置。详见下方的 **日志配置** 部分。 | (见具体字段) | 否 |
| `api` | `object` | API 配置。详见下方的 **API 配置** 部分。 | (见具体字段) | 否 |
| `llms` | `列表` | 大语言模型 (LLM) 配置。会被其他配置部分引用。详见下方的 **LLM 配置** 部分。 | `[]` | 是 (至少 1 个) |
| `scrape` | `object` | 抓取配置。详见下方的 **抓取配置** 部分。 | (见具体字段) | 否 |
| `storage` | `object` | 存储配置。详见下方的 **存储配置** 部分。 | (见具体字段) | 否 |
| `scheduls` | `object` | 用于监控 Feed 的调度配置 (也称为监控规则)。详见下方的 **调度配置** 部分。 | (见具体字段) | 否 |
| `notify` | `object` | 通知配置。它接收来自调度模块的结果,通过路由配置进行分组,并通过通知渠道发送给通知接收者。详见下方的 **通知配置**, **通知路由**, **通知接收者**, **通知渠道** 部分。 | (见具体字段) | |
| 字段 | 类型 | 描述 | 默认值 | 是否必需 |
| :---------- | :------- | :------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :------------- | :------------- |
| `timezone` | `string` | 应用的时区。例如 `Asia/Shanghai`。 | 服务器本地时区 | 否 |
| `telemetry` | `object` | Telemetry 配置。详见下方的 **Telemetry 配置** 部分。 | (见具体字段) | 否 |
| `api` | `object` | API 配置。详见下方的 **API 配置** 部分。 | (见具体字段) | 否 |
| `llms` | `列表` | 大语言模型 (LLM) 配置。会被其他配置部分引用。详见下方的 **LLM 配置** 部分。 | `[]` | 是 (至少 1 个) |
| `jina` | `object` | Jina AI 配置。详见下方的 **Jina AI 配置** 部分。 | (见具体字段) | 否 |
| `scrape` | `object` | 抓取配置。详见下方的 **抓取配置** 部分。 | (见具体字段) | 否 |
| `storage` | `object` | 存储配置。详见下方的 **存储配置** 部分。 | (见具体字段) | 否 |
| `scheduls` | `object` | 用于监控 Feed 的调度配置 (也称为监控规则)。详见下方的 **调度配置** 部分。 | (见具体字段) | 否 |
| `notify` | `object` | 通知配置。它接收来自调度模块的结果,通过路由配置进行分组,并通过通知渠道发送给通知接收者。详见下方的 **通知配置**, **通知路由**, **通知接收者**, **通知渠道** 部分。 | (见具体字段) | 是 |
### 日志配置 (`log`)
### Telemetry 配置 (`telemetry`)
| 字段 | 类型 | 描述 | 默认值 | 是否必需 |
| :---------- | :------- | :--------------------------------------------------------- | :----- | :------- |
| `log.level` | `string` | 日志级别, 可选值为 `debug`, `info`, `warn`, `error` 之一。 | `info` | 否 |
| 字段 | 类型 | 描述 | 默认值 | 是否必需 |
| :-------------------- | :------- | :----------------------------------------------------------------------------- | :----------- | :------- |
| `telemetry.address` | `string` | 暴露 Prometheus 指标 & pprof。 | `:9090` | 否 |
| `telemetry.log` | `object` | Telemetry 相关的日志配置。 | (见具体字段) | 否 |
| `telemetry.log.level` | `string` | Telemetry 相关消息的日志级别, 可选值为 `debug`, `info`, `warn`, `error` 之一。 | `info` | 否 |
### API 配置 (`api`)
@@ -38,15 +41,25 @@
| `llms[].api_key` | `string` | LLM 的 API 密钥。 | | 是 |
| `llms[].model` | `string` | LLM 的模型。例如 `gpt-4o-mini`。如果用于生成任务 (如总结),则不能为空。如果此 LLM 被使用,则不能与 `embedding_model` 同时为空。 | | 条件性必需 |
| `llms[].embedding_model` | `string` | LLM 的 Embedding 模型。例如 `text-embedding-3-small`。如果用于 Embedding则不能为空。如果此 LLM 被使用,则不能与 `model` 同时为空。**注意:** 初次使用后请勿直接修改,应添加新的 LLM 配置。 | | 条件性必需 |
| `llms[].tts_model` | `string` | LLM 的文本转语音 (TTS) 模型。 | | 否 |
| `llms[].temperature` | `float32` | LLM 的温度 (0-2)。 | `0.0` | 否 |
### Jina AI 配置 (`jina`)
此部分用于配置 Jina AI Reader API 的相关参数,主要供重写规则中的 `crawl_by_jina` 类型使用。
| 字段 | 类型 | 描述 | 默认值 | 是否必需 |
| :----------- | :------- | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :----- | :------- |
| `jina.token` | `string` | Jina AI 的 API Token。从 [Jina AI API Dashboard](https://jina.ai/api-dashboard/) 获取。提供 Token 可以获得更高的服务速率限制。如果留空,将以匿名用户身份请求,速率限制较低。 | | 否 |
### 抓取配置 (`scrape`)
| 字段 | 类型 | 描述 | 默认值 | 是否必需 |
| :----------------------- | :-------------- | :----------------------------------------------------------------------------------------------------------------------------------------------------- | :----- | :---------------------------------- |
| `scrape.past` | `time.Duration` | 抓取 Feed 的回溯时间窗口。例如 `1h` 表示只抓取过去 1 小时的 Feed。 | `3d` | 否 |
| `scrape.past` | `time.Duration` | 抓取 Feed 的回溯时间窗口。例如 `1h` 表示只抓取过去 1 小时的 Feed。 | `24h` | 否 |
| `scrape.interval` | `time.Duration` | 抓取每个源的频率 (全局默认值)。例如 `1h`。 | `1h` | 否 |
| `scrape.rsshub_endpoint` | `string` | RSSHub 的端点。你可以部署自己的 RSSHub 服务器或使用公共实例 (参见 [RSSHub 文档](https://docs.rsshub.app/guide/instances))。例如 `https://rsshub.app`。 | | 是 (如果使用了 `rsshub_route_path`) |
| `scrape.rsshub_access_key` | `string` | RSSHub 的访问密钥。用于访问控制。(详情见 [RSSHub文档访问控制](https://docs.rsshub.app/deploy/config#access-control-configurations)) | | 否 |
| `scrape.sources` | `对象列表` | 用于抓取 Feed 的源列表。详见下方的 **抓取源配置**。 | `[]` | 是 (至少一个) |
### 抓取源配置 (`scrape.sources[]`)
@@ -69,10 +82,11 @@
### 存储配置 (`storage`)
| 字段 | 类型 | 描述 | 默认值 | 是否必需 |
| :------------- | :------- | :-------------------------------------------- | :----------- | :------- |
| `storage.dir` | `string` | 所有存储的基础目录。应用运行后不可更改。 | `./data` | 否 |
| `storage.feed` | `object` | Feed 存储配置。详见下方的 **Feed 存储配置**。 | (见具体字段) | 否 |
| 字段 | 类型 | 描述 | 默认值 | 是否必需 |
| :--------------- | :------- | :-------------------------------------------------------------- | :----------- | :------- |
| `storage.dir` | `string` | 所有存储的基础目录。应用运行后不可更改。 | `./data` | 否 |
| `storage.feed` | `object` | Feed 存储配置。详见下方的 **Feed 存储配置**。 | (见具体字段) | 否 |
| `storage.object` | `object` | 对象存储配置,用于存储播客等文件。详见下方的 **对象存储配置**。 | (见具体字段) | 否 |
### Feed 存储配置 (`storage.feed`)
@@ -84,32 +98,61 @@
| `storage.feed.retention` | `time.Duration` | Feed 的保留时长。 | `8d` | 否 |
| `storage.feed.block_duration` | `time.Duration` | 每个基于时间的 Feed 存储块的保留时长 (类似于 Prometheus TSDB Block)。 | `25h` | 否 |
### 对象存储配置 (`storage.object`)
| 字段 | 类型 | 描述 | 默认值 | 是否必需 |
| :--------------------------------- | :------- | :----------------------------- | :----- | :-------------------- |
| `storage.object.endpoint` | `string` | 对象存储的端点。 | | 是 (如果使用播客功能) |
| `storage.object.access_key_id` | `string` | 对象存储的 Access Key ID。 | | 是 (如果使用播客功能) |
| `storage.object.secret_access_key` | `string` | 对象存储的 Secret Access Key。 | | 是 (如果使用播客功能) |
| `storage.object.bucket` | `string` | 对象存储的存储桶名称。 | | 是 (如果使用播客功能) |
| `storage.object.bucket_url` | `string` | 对象存储的桶访问 URL。 | | 否 |
### 重写规则配置 (`storage.feed.rewrites[]`)
定义在存储前处理 Feed 的规则。规则按顺序应用。
| 字段 | 类型 | 描述 | 默认值 | 是否必需 |
| :--------------------------------------- | :------- | :------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :----------------------- | :--------------------------------------------- |
| `...rewrites[].source_label` | `string` | 用作转换源文本的 Feed 标签。默认标签包括: `type`, `source`, `title`, `link`, `pub_time`, `content` | `content` | 否 |
| `...rewrites[].skip_too_short_threshold` | `*int` | 如果设置,`source_label` 文本长度低于此阈值的 Feed 将被此规则跳过 (处理将继续进行下一条规则,如果没有更多规则则进行 Feed 存储)。有助于过滤掉过短/信息量不足的 Feed。 | `300` | 否 |
| `...rewrites[].transform` | `object` | 配置如何转换 `source_label` 文本。详见下方的 **重写规则转换配置**。如果未设置,则直接使用 `source_label` 文本进行匹配。 | `nil` | 否 |
| `...rewrites[].match` | `string` | 用于匹配 (转换后) 文本的简单字符串。不能与 `match_re` 同时设置 | | 否 (使用 `match``match_re`) |
| `...rewrites[].match_re` | `string` | 用于匹配 (转换后) 文本的正则表达式。 | `.*` (匹配所有) | 否 (使用 `match``match_re`) |
| `...rewrites[].action` | `string` | 匹配时执行的操作: `create_or_update_label` (使用匹配/转换后的文本添加/更新标签), `drop_feed` (完全丢弃该 Feed)。 | `create_or_update_label` | 否 |
| `...rewrites[].label` | `string` | 要创建或更新的 Feed 标签名称。 | | 是 (如果 `action``create_or_update_label`) |
### 重写规则转换配置 (`storage.feed.rewrites[].transform`)
| 字段 | 类型 | 描述 | 默认值 | 是否必需 |
| :--------------------- | :------- | :------------------------------------------------------------------- | :----- | :------- |
| `...transform.to_text` | `object` | 使用 LLM 将源文本转换为文本。详见下方的 **重写规则转换为文本配置**。 | `nil` | 否 |
| 字段 | 类型 | 描述 | 默认值 | 是否必需 |
| :--------------------------------------- | :----------- | :------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :----------------------- | :--------------------------------------------- |
| `...rewrites[].if` | `字符串列表` | 用于匹配 Feed 的条件配置。如果未设置,则表示匹配所有 Feed。类似于标签过滤器例如 `["source=github", "title!=xxx"]`。如果条件不满足,则跳过此规则。 | `[]` (匹配所有) | 否 |
| `...rewrites[].source_label` | `string` | 用作转换源文本的 Feed 标签。默认标签包括: `type`, `source`, `title`, `link`, `pub_time`, `content`。 | `content` | 否 |
| `...rewrites[].skip_too_short_threshold` | `*int` | 如果设置,`source_label` 文本长度低于此阈值的 Feed 将被此规则跳过 (处理将继续进行下一条规则,如果没有更多规则则进行 Feed 存储)。有助于过滤掉过短/信息量不足的 Feed。 | `300` | 否 |
| `...rewrites[].transform` | `object` | 配置如何转换 `source_label` 文本。详见下方的 **重写规则转换配置**。如果未设置,则直接使用 `source_label` 文本进行匹配。 | `nil` | 否 |
| `...rewrites[].match` | `string` | 用于匹配 (转换后) 文本的简单字符串。不能与 `match_re` 同时设置。 | | 否 (使用 `match``match_re`) |
| `...rewrites[].match_re` | `string` | 用于匹配 (转换后) 文本的正则表达式。 | `.*` (匹配所有) | 否 (使用 `match``match_re`) |
| `...rewrites[].action` | `string` | 匹配时执行的操作: `create_or_update_label` (使用匹配/转换后的文本添加/更新标签), `drop_feed` (完全丢弃该 Feed)。 | `create_or_update_label` | 否 |
| `...rewrites[].label` | `string` | 要创建或更新的 Feed 标签名称。 | | 是 (如果 `action``create_or_update_label`) |
| `...transform.to_text` | `object` | 使用 LLM 将源文本转换为文本。详见下方的 **重写规则转换为文本配置**。 | `nil` | 否 |
| `...transform.to_podcast` | `object` | 将源文本转换为播客。详见下方的 **重写规则转换为播客配置**。 | `nil` | 否 |
### 重写规则转换为文本配置 (`storage.feed.rewrites[].transform.to_text`)
| 字段 | 类型 | 描述 | 默认值 | 是否必需 |
| :------------------ | :------- | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | :---------------------- | :------- |
| `...to_text.llm` | `string` | 用于转换的 LLM 名称 (来自 `llms` 部分)。 | `llms` 部分中的默认 LLM | 否 |
| `...to_text.prompt` | `string` | 用于转换的 Prompt。源文本将被注入。可以使用 Go 模板语法引用内置 Prompt: `{{ .summary }}`, `{{ .category }}`, `{{ .tags }}`, `{{ .score }}`, `{{ .comment_confucius }}`, `{{ .summary_html_snippet }}`。 | | 是 |
此配置定义了如何将 `source_label` 的文本进行转换。
| 字段 | 类型 | 描述 | 默认值 | 是否必需 |
| :------------------ | :------- | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :---------------------- | :--------------------------- |
| `...to_text.type` | `string` | 转换的类型。可选值:<ul><li>`prompt` (默认): 使用 LLM 和指定的 Prompt 转换源文本。</li><li>`crawl`: 将源文本视为 URL直接抓取该 URL 指向的网页内容,并将其转换为 Markdown 格式。此方式为本地抓取,会尝试遵循 `robots.txt`。</li><li>`crawl_by_jina`: 将源文本视为 URL通过 [Jina AI Reader API](https://jina.ai/reader/) 抓取和处理网页内容,并返回 Markdown。功能可能更强大例如处理动态页面但依赖 Jina AI 服务。</li></ul> | `prompt` | 否 |
| `...to_text.llm` | `string` | **仅当 `type` 为 `prompt` 时有效。** 用于转换的 LLM 名称 (来自 `llms` 部分)。如果未指定,将使用在 `llms` 部分中标记为 `default: true` 的 LLM。 | `llms` 部分中的默认 LLM | 否 |
| `...to_text.prompt` | `string` | **仅当 `type` 为 `prompt` 时有效。** 用于转换的 Prompt。源文本将被注入。可以使用 Go 模板语法引用内置 Prompt: `{{ .summary }}`, `{{ .category }}`, `{{ .tags }}`, `{{ .score }}`, `{{ .comment_confucius }}`, `{{ .summary_html_snippet }}`, `{{ .summary_html_snippet_for_small_model }}`。 | | 是 (如果 `type``prompt`) |
### 重写规则转换为播客配置 (`storage.feed.rewrites[].transform.to_podcast`)
此配置定义了如何将 `source_label` 的文本转换为播客。
| 字段 | 类型 | 描述 | 默认值 | 是否必需 |
| :------------------------------------------- | :--------- | :-------------------------------------------------------------------------------------------------------- | :---------------------- | :------- |
| `...to_podcast.llm` | `string` | 用于生成播客稿件的 LLM 名称 (来自 `llms` 部分)。 | `llms` 部分中的默认 LLM | 否 |
| `...to_podcast.transcript_additional_prompt` | `string` | 附加到播客稿件生成 Prompt 的额外指令。 | | 否 |
| `...to_podcast.tts_llm` | `string` | 用于文本转语音 (TTS) 的 LLM 名称 (来自 `llms` 部分)。**注意:目前仅支持 `provider``gemini` 的 LLM**。 | `llms` 部分中的默认 LLM | 否 |
| `...to_podcast.speakers` | `对象列表` | 播客的演讲者列表。详见下方的 **演讲者配置**。 | `[]` | 是 |
#### 演讲者配置 (`...to_podcast.speakers[]`)
| 字段 | 类型 | 描述 | 默认值 | 是否必需 |
| :-------------------- | :------- | :------------------------ | :----- | :------- |
| `...speakers[].name` | `string` | 演讲者的名字。 | | 是 |
| `...speakers[].role` | `string` | 演讲者的角色描述 (人设)。 | | 否 |
| `...speakers[].voice` | `string` | 演讲者的声音。 | | 是 |
### 调度配置 (`scheduls`)
@@ -121,14 +164,15 @@
### 调度规则配置 (`scheduls.rules[]`)
| 字段 | 类型 | 描述 | 默认值 | 是否必需 |
| :-------------------------------- | :-------------- | :---------------------------------------------------------------------------------------------------------------------------------------------------------------- | :----- | :---------------------------------------- |
| `scheduls.rules[].name` | `string` | 规则的名称。 | | 是 |
| `scheduls.rules[].query` | `string` | 用于查找相关 Feed 的语义查询。可选。 | | 否 |
| `scheduls.rules[].threshold` | `float32` | 相关性得分阈值 (0-1),用于过滤语义查询结果。仅在设置了 `query` 时有效。 | `0.6` | 否 |
| `scheduls.rules[].label_filters` | `字符串列表` | 基于 Feed 标签的过滤器 (等于或不等于)。例如 `["category=tech", "source!=github"]`。 | `[]` | 否 |
| `scheduls.rules[].every_day` | `string` | 相对于每天结束时间的查询范围。格式: `start~end` (HH:MM)。例如, `00:00~23:59` (今天), `-22:00~07:00` (昨天 22:00 到今天 07:00)。不能与 `watch_interval` 同时设置。 | | 否 (使用 `every_day``watch_interval`) |
| `scheduls.rules[].watch_interval` | `time.Duration` | 运行查询的频率。例如 `10m`。不能与 `every_day` 同时设置。 | `10m` | 否 (使用 `every_day``watch_interval`) |
| 字段 | 类型 | 描述 | 默认值 | 是否必需 |
| :-------------------------------- | :------------------ | :---------------------------------------------------------------------------------------------------------------------------------------------------------------- | :----- | :---------------------------------------- |
| `scheduls.rules[].name` | `string` | 规则的名称。 | | 是 |
| `scheduls.rules[].query` | `string` | 用于查找相关 Feed 的语义查询。可选。 | | 否 |
| `scheduls.rules[].threshold` | `float32` | 相关性得分阈值 (0-1),用于过滤语义查询结果。仅在设置了 `query` 时有效。 | `0.6` | 否 |
| `scheduls.rules[].label_filters` | `字符串列表` | 基于 Feed 标签的过滤器 (等于或不等于)。例如 `["category=tech", "source!=github"]`。 | `[]` | 否 |
| `scheduls.rules[].labels` | `map[string]string` | 附加到此源 Feed 的额外键值标签。 | `{}` | 否 |
| `scheduls.rules[].every_day` | `string` | 相对于每天结束时间的查询范围。格式: `start~end` (HH:MM)。例如, `00:00~23:59` (今天), `-22:00~07:00` (昨天 22:00 到今天 07:00)。不能与 `watch_interval` 同时设置。 | | 否 (使用 `every_day``watch_interval`) |
| `scheduls.rules[].watch_interval` | `time.Duration` | 运行查询的频率。例如 `10m`。不能与 `every_day` 同时设置。 | `10m` | 否 (使用 `every_day``watch_interval`) |
### 通知配置 (`notify`)
@@ -142,22 +186,26 @@
此结构可以使用 `sub_routes` 进行嵌套。Feed 会首先尝试匹配子路由;如果没有子路由匹配,则应用父路由的配置。
| 字段 | 类型 | 描述 | 默认值 | 是否必需 |
| :--------------------------------- | :----------- | :-------------------------------------------------------------------------------------------------------- | :----- | :------------ |
| `...matchers` (仅子路由) | `字符串列表` | 标签匹配器,用于确定 Feed 是否属于此子路由。例如 `["category=tech", "source!=github"]`。 | `[]` | 是 (仅子路由) |
| `...receivers` | `字符串列表` | 接收者的名称列表 (在 `notify.receivers` 中定义),用于发送匹配此路由的 Feed 的通知。 | `[]` | 是 (至少一个) |
| `...group_by` | `字符串列表` | 在发送通知前用于对 Feed 进行分组的标签列表。每个分组会产生一个单独的通知。例如 `["source", "category"]`。 | `[]` | 是 (至少一个) |
| `...compress_by_related_threshold` | `*float32`   | 如果设置,则根据语义相关性压缩分组内高度相似的 Feed,仅发送一个代表。阈值 (0-1),越高表示越相似。 | `0.85` | 否 |
| `...sub_routes` | `对象列表` | 嵌套路由列表。允许定义更具体的路由规则。每个对象遵循 **通知路由配置** | `[]` | 否 |
| 字段 | 类型 | 描述 | 默认值 | 是否必需 |
| :--------------------------------- | :----------- | :-------------------------------------------------------------------------------------------------------- | :---------------------- | :------------ |
| `...matchers` (仅子路由) | `字符串列表` | 标签匹配器,用于确定 Feed 是否属于此子路由。例如 `["category=tech", "source!=github"]`。 | `[]` | 是 (仅子路由) |
| `...receivers` | `字符串列表` | 接收者的名称列表 (在 `notify.receivers` 中定义),用于发送匹配此路由的 Feed 的通知。 | `[]` | 是 (至少一个) |
| `...group_by` | `字符串列表` | 在发送通知前用于对 Feed 进行分组的标签列表。每个分组会产生一个单独的通知。例如 `["source", "category"]`。 | `[]` | 是 (至少一个) |
| `...source_label` | `string` | 从每个 Feed 中提取内容并进行总结的源标签。默认为所有标签。强烈建议设置为 'summary' 以减少上下文长度。 | 所有标签 | 否 |
| `...summary_prompt` | `string` | 用于总结每个分组的 Feed 的 Prompt。 | | 否 |
| `...llm` | `string` | 使用的 LLM 的名称。默认为 `llms` 部分中的默认 LLM。建议使用上下文长度较大的 LLM。 | `llms` 部分中的默认 LLM | 否 |
| `...compress_by_related_threshold` | `*float32`   | 如果设置,则根据语义相关性压缩分组内高度相似的 Feed,仅发送一个代表。阈值 (0-1),越高表示越相似。 | `0.85`                  | 否            |
| `...sub_routes` | `对象列表` | 嵌套路由列表。允许定义更具体的路由规则。每个对象遵循 **通知路由配置**。 | `[]` | 否 |
### 通知接收者配置 (`notify.receivers[]`)
定义*谁*接收通知。
| 字段 | 类型 | 描述 | 默认值 | 是否必需 |
| :------------------------- | :------- | :------------------------------- | :----- | :------------------ |
| `notify.receivers[].name` | `string` | 接收者的唯一名称。在路由中使用。 | | 是 |
| `notify.receivers[].email` | `string` | 接收者的电子邮件地址。 | | 是 (如果使用 Email) |
| 字段 | 类型 | 描述 | 默认值 | 是否必需 |
| :--------------------------- | :------- | :------------------------------------------------------- | :----- | :-------------------- |
| `notify.receivers[].name` | `string` | 接收者的唯一名称。在路由中使用。 | | 是 |
| `notify.receivers[].email` | `string` | 接收者的电子邮件地址。 | | 是 (如果使用 Email) |
| `notify.receivers[].webhook` | `object` | 接收者的 Webhook 配置。例如: `webhook: { "url": "xxx" }` | | 是 (如果使用 Webhook) |
### 通知渠道配置 (`notify.channels`)
@@ -175,4 +223,4 @@
| `...email.from` | `string` | 发件人 Email 地址。 | | 是 |
| `...email.password` | `string` | 发件人 Email 的应用专用密码。(对于 Gmail, 参见 [Google 应用密码](https://support.google.com/mail/answer/185833))。 | | 是 |
| `...email.feed_markdown_template` | `string` | 用于在 Email 正文中格式化每个 Feed 的 Markdown 模板。默认渲染 Feed 内容。不能与 `feed_html_snippet_template` 同时设置。可用的模板变量取决于 Feed 标签。 | `{{ .content }}` | 否 |
| `...email.feed_html_snippet_template` | `string` | 用于格式化每个 Feed 的 HTML 片段模板。不能与 `feed_markdown_template` 同时设置。可用的模板变量取决于 Feed 标签。 | | 否 |
| `...email.feed_html_snippet_template` | `string` | 用于格式化每个 Feed 的 HTML 片段模板。不能与 `feed_markdown_template` 同时设置。可用的模板变量取决于 Feed 标签。 | | 否 |

View File

@@ -1,178 +1,226 @@
| Field | Type | Description | Default | Required |
| :------- | :----- | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :---------------------- | :-------- |
| timezone | string | The timezone of the app. e.g. `Asia/Shanghai`. | server's local timezone | No |
| log | object | The log config. See **Log Configuration** section below. | (see fields) | No |
| api | object | The API config. See **API Configuration** section below. | (see fields) | No |
| llms     | list   | The LLMs config. Referenced by other config sections. See **LLM Configuration** section below. | `[]` | Yes (>=1) |
| scrape | object | The scrape config. See **Scrape Configuration** section below. | (see fields) | No |
| storage | object | The storage config. See **Storage Configuration** section below. | (see fields) | No |
| scheduls | object | The scheduls config for monitoring feeds (aka monitoring rules). See **Scheduls Configuration** section below. | (see fields) | No |
| notify | object | The notify config. It receives results from scheduls, groups them via route config, and sends to receivers via channels. See **Notify Configuration**, **Notify Route**, **Notify Receiver**, **Notify Channels** sections below. | (see fields) | Yes |
| Field | Type | Description | Default Value | Required |
| :---------- | :------- | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :-------------------- | :--------------- |
| `timezone` | `string` | The application's timezone. E.g., `Asia/Shanghai`. | Server local time | No |
| `telemetry` | `object` | Telemetry configuration. See the **Telemetry Configuration** section below. | (See specific fields) | No |
| `api` | `object` | API configuration. See the **API Configuration** section below. | (See specific fields) | No |
| `llms` | `list` | Large Language Model (LLM) configuration. Referenced by other configuration sections. See the **LLM Configuration** section below. | `[]` | Yes (at least 1) |
| `jina` | `object` | Jina AI configuration. See the **Jina AI Configuration** section below. | (See specific fields) | No |
| `scrape` | `object` | Scrape configuration. See the **Scrape Configuration** section below. | (See specific fields) | No |
| `storage` | `object` | Storage configuration. See the **Storage Configuration** section below. | (See specific fields) | No |
| `scheduls` | `object` | Scheduling configuration for monitoring feeds (also known as monitoring rules). See the **Scheduling Configuration** section below. | (See specific fields) | No |
| `notify` | `object` | Notification configuration. It receives results from the scheduling module, groups them via routing configuration, and sends them to notification receivers via notification channels. See the **Notification Configuration**, **Notification Routing**, **Notification Receivers**, **Notification Channels** sections below. | (See specific fields) | Yes |
### Log Configuration (`log`)
### Telemetry Configuration (`telemetry`)
| Field | Type | Description | Default | Required |
| :---------- | :----- | :-------------------------------------------------- | :------ | :------- |
| `log.level` | string | Log level, one of `debug`, `info`, `warn`, `error`. | `info` | No |
| Field | Type | Description | Default Value | Required |
| :-------------------- | :------- | :--------------------------------------------------------------------------------- | :-------------------- | :------- |
| `telemetry.address` | `string` | Exposes Prometheus metrics & pprof. | `:9090` | No |
| `telemetry.log` | `object` | Log configuration related to telemetry. | (See specific fields) | No |
| `telemetry.log.level` | `string` | Log level for telemetry-related messages, one of `debug`, `info`, `warn`, `error`. | `info` | No |
**API Configuration (`api`)**
### API Configuration (`api`)
| Field | Type | Description | Default | Required |
| :----------------- | :----- | :------------------------------------------------------------------------------------------------------------------ | :---------------------------- | :------------------------------------- |
| `api.http` | object | The HTTP API config. | (see fields) | No |
| `api.http.address` | string | The address (`[host]:port`) of the HTTP API. e.g. `0.0.0.0:1300`. Cannot be changed after the app is running. | `:1300` | No |
| `api.mcp` | object | The MCP API config. | (see fields) | No |
| `api.mcp.address` | string | The address (`[host]:port`) of the MCP API. e.g. `0.0.0.0:1301`. Cannot be changed after the app is running. | `:1301` | No |
| `api.llm` | string | The LLM name for summarizing feeds. e.g. `my-favorite-gemini-king`. Refers to an LLM defined in the `llms` section. | default LLM in `llms` section | Yes (if summarization feature is used) |
| Field | Type | Description | Default Value | Required |
| :----------------- | :------- | :--------------------------------------------------------------------------------------------------------------------------- | :---------------------------- | :--------------------- |
| `api.http` | `object` | HTTP API configuration. | (See specific fields) | No |
| `api.http.address` | `string` | Address for the HTTP API (`[host]:port`). E.g., `0.0.0.0:1300`. Cannot be changed after the application starts. | `:1300` | No |
| `api.mcp` | `object` | MCP API configuration. | (See specific fields) | No |
| `api.mcp.address` | `string` | Address for the MCP API (`[host]:port`). E.g., `0.0.0.0:1301`. Cannot be changed after the application starts. | `:1301` | No |
| `api.llm` | `string` | Name of the LLM used for summarizing feeds. E.g., `my-favorite-gemini-king`. Refers to an LLM defined in the `llms` section. | Default LLM in `llms` section | Yes (if using summary) |
### LLM Configuration (`llms[]`)
This section defines a list of available Large Language Models. At least one LLM configuration is required.
This section defines the list of available Large Language Models. At least one LLM configuration is required.
| Field | Type | Description | Default | Required |
| :----------------------- | :------ | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :-------------------------- | :------------------------------------------------------------- |
| `llms[].name` | string | The name (or 'id') of the LLM. e.g. `my-favorite-gemini-king`. Used to refer to this LLM in other sections (`api.llm`, `storage.feed.embedding_llm`, etc.). | | Yes |
| `llms[].default` | bool | Whether this LLM is the default LLM. Only one LLM can be the default. | `false` | No (but one must be `true` if default behavior is relied upon) |
| `llms[].provider` | string | The provider of the LLM, one of `openai`, `openrouter`, `deepseek`, `gemini`, `volc`, `siliconflow`. e.g. `openai`. | | Yes |
| `llms[].endpoint` | string | The custom endpoint of the LLM. e.g. `https://api.openai.com/v1`. | (provider specific default) | No |
| `llms[].api_key` | string | The API key of the LLM. | | Yes |
| `llms[].model` | string | The model of the LLM. e.g. `gpt-4o-mini`. Cannot be empty if used for generation tasks (like summarization). Cannot be empty with `embedding_model` at same time if this LLM is used. | | Conditionally Yes |
| `llms[].embedding_model` | string | The embedding model of the LLM. e.g. `text-embedding-3-small`. Cannot be empty if used for embedding. Cannot be empty with `model` at same time if this LLM is used. **NOTE:** Do not modify after initial use; add a new LLM config instead. | | Conditionally Yes |
| `llms[].temperature` | float32 | The temperature (0-2) of the LLM. | `0.0` | No |
| Field | Type | Description | Default Value | Required |
| :----------------------- | :-------- | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :-------------------------- | :--------------------------------------------------------- |
| `llms[].name` | `string` | Name (or 'id') of the LLM. E.g., `my-favorite-gemini-king`. Used to refer to this LLM in other configuration sections (e.g., `api.llm`, `storage.feed.embedding_llm`). | | Yes |
| `llms[].default` | `bool` | Whether this LLM is the default LLM. Only one LLM can be the default. | `false` | No (but one must be `true` if relying on default behavior) |
| `llms[].provider` | `string` | Provider of the LLM, one of `openai`, `openrouter`, `deepseek`, `gemini`, `volc`, `siliconflow`. E.g., `openai`. | | Yes |
| `llms[].endpoint` | `string` | Custom endpoint for the LLM. E.g., `https://api.openai.com/v1`. | (Provider-specific default) | No |
| `llms[].api_key` | `string` | API key for the LLM. | | Yes |
| `llms[].model`           | `string`  | Model of the LLM. E.g., `gpt-4o-mini`. Cannot be empty if used for generation tasks (e.g., summarization). If this LLM is used, `model` and `embedding_model` cannot both be empty. | | Conditionally Required |
| `llms[].embedding_model` | `string`  | Embedding model of the LLM. E.g., `text-embedding-3-small`. Cannot be empty if used for embedding. If this LLM is used, `embedding_model` and `model` cannot both be empty. **Note:** Do not modify directly after initial use; add a new LLM configuration instead. | | Conditionally Required |
| `llms[].tts_model` | `string` | The Text-to-Speech (TTS) model of the LLM. | | No |
| `llms[].temperature` | `float32` | Temperature of the LLM (0-2). | `0.0` | No |
### Jina AI Configuration (`jina`)
This section configures parameters related to the Jina AI Reader API, primarily used by the `crawl_by_jina` type in rewrite rules.
| Field | Type | Description | Default Value | Required |
| :----------- | :------- | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :------------ | :------- |
| `jina.token` | `string` | API Token for Jina AI. Obtain from [Jina AI API Dashboard](https://jina.ai/api-dashboard/). Providing a token grants higher service rate limits. If left empty, requests will be made as an anonymous user with lower rate limits. | | No |
### Scrape Configuration (`scrape`)
| Field | Type | Description | Default | Required |
| :----------------------- | :-------------- | :--------------------------------------------------------------------------------------------------------------------------------------------------------------- | :------ | :-------------------------------- |
| `scrape.past` | duration | The lookback time window for scraping feeds. e.g. `1h` means only scrape feeds in the past 1 hour. | `3d` | No |
| `scrape.interval` | duration | How often to scrape each source (global default). e.g. `1h`. | `1h` | No |
| `scrape.rsshub_endpoint` | string | The endpoint of the RSSHub. You can deploy your own or use a public one (see [RSSHub Docs](https://docs.rsshub.app/guide/instances)). e.g. `https://rsshub.app`. | | Yes (if `rsshub_route_path` used) |
| `scrape.sources` | list of objects | The sources for scraping feeds. See **Scrape Source Configuration** below. | `[]` | Yes (at least one) |
| Field | Type | Description | Default Value | Required |
| :----------------------- | :---------------- | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :------------ | :----------------------------------- |
| `scrape.past` | `time.Duration` | Time window to look back when scraping feeds. E.g., `1h` means only scrape feeds from the past 1 hour. | `24h` | No |
| `scrape.interval` | `time.Duration` | Frequency to scrape each source (global default). E.g., `1h`. | `1h` | No |
| `scrape.rsshub_endpoint` | `string` | Endpoint for RSSHub. You can deploy your own RSSHub server or use a public instance (see [RSSHub Documentation](https://docs.rsshub.app/guide/instances)). E.g., `https://rsshub.app`. | | Yes (if `rsshub_route_path` is used) |
| `scrape.rsshub_access_key` | `string` | The access key for RSSHub. Used for access control (see [RSSHub config](https://docs.rsshub.app/deploy/config#access-control-configurations)). | | No |
| `scrape.sources` | `list of objects` | List of sources to scrape feeds from. See **Scrape Source Configuration** below. | `[]` | Yes (at least one) |
### Scrape Source Configuration (`scrape.sources[]`)
Describes each source to be scraped.
| Field | Type | Description | Default | Required |
| :-------------------------- | :---------------- | :------------------------------------------------------------------------------------------------------------------------------------- | :-------------- | :-------------------------- |
| `scrape.sources[].interval` | duration | How often to scrape this specific source. Overrides the global `scrape.interval`. | global interval | No |
| `scrape.sources[].name` | string | The name of the source. Used for labeling feeds. | | Yes |
| `scrape.sources[].labels` | map[string]string | Additional key-value labels to add to feeds from this source. | `{}` | No |
| `scrape.sources[].rss` | object | The RSS config for this source. See **Scrape Source RSS Configuration** below. Only one source type (e.g., RSS) can be set per source. | `nil` | Yes (if source type is RSS) |
| Field | Type | Description | Default Value | Required |
| :-------------------------- | :------------------ | :------------------------------------------------------------------------------------------------------------------------------------ | :---------------- | :-------------------------- |
| `scrape.sources[].interval` | `time.Duration` | Frequency to scrape this specific source. Overrides global `scrape.interval`. | Global `interval` | No |
| `scrape.sources[].name` | `string` | Name of the source. Used to tag feeds. | | Yes |
| `scrape.sources[].labels` | `map[string]string` | Additional key-value labels to attach to feeds from this source. | `{}` | No |
| `scrape.sources[].rss` | `object` | RSS configuration for this source. See **Scrape Source RSS Configuration** below. Each source can only have one type set (e.g., RSS). | `nil` | Yes (if source type is RSS) |
### Scrape Source RSS Configuration (`scrape.sources[].rss`)
| Field | Type | Description | Default | Required |
| :--------------------------------------- | :----- | :------------------------------------------------------------------------------------------------------------------------------------ | :------ | :---------------------------------------------------- |
| `scrape.sources[].rss.url` | string | The full URL of the RSS feed. e.g. `http://localhost:1200/github/trending/daily/any`. Cannot be set if `rsshub_route_path` is set. | | Yes (unless `rsshub_route_path` is set) |
| `scrape.sources[].rss.rsshub_route_path` | string | The RSSHub route path. e.g. `github/trending/daily/any`. Will be joined with `scrape.rsshub_endpoint`. Cannot be set if `url` is set. | | Yes (unless `url` is set, requires `rsshub_endpoint`) |
| Field | Type | Description | Default Value | Required |
| :--------------------------------------- | :------- | :------------------------------------------------------------------------------------------------------------------------------------------------------------- | :------------ | :-------------------------------------------------------- |
| `scrape.sources[].rss.url` | `string` | Full URL of the RSS feed. E.g., `http://localhost:1200/github/trending/daily/any`. Cannot be set if `rsshub_route_path` is set. | | Yes (unless `rsshub_route_path` is set) |
| `scrape.sources[].rss.rsshub_route_path` | `string` | RSSHub route path. E.g., `github/trending/daily/any`. Will be concatenated with `scrape.rsshub_endpoint` to form the final URL. Cannot be set if `url` is set. | | Yes (unless `url` is set, and requires `rsshub_endpoint`) |
### Storage Configuration (`storage`)
| Field | Type | Description | Default | Required |
| :------------- | :----- | :------------------------------------------------------------------------------- | :----------- | :------- |
| `storage.dir` | string | The base directory for all storages. Cannot be changed after the app is running. | `./data` | No |
| `storage.feed` | object | The feed storage config. See **Feed Storage Configuration** below. | (see fields) | No |
| Field | Type | Description | Default Value | Required |
| :--------------- | :------- | :-------------------------------------------------------------------------------------------------------- | :-------------------- | :------- |
| `storage.dir` | `string` | Base directory for all storage. Cannot be changed after the application starts. | `./data` | No |
| `storage.feed` | `object` | Feed storage configuration. See **Feed Storage Configuration** below. | (See specific fields) | No |
| `storage.object` | `object` | Object storage configuration for storing files like podcasts. See **Object Storage Configuration** below. | (See specific fields) | No |
### Feed Storage Configuration (`storage.feed`)
| Field | Type | Description | Default | Required |
| :---------------------------- | :-------------- | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :---------------------------- | :------- |
| `storage.feed.rewrites` | list of objects | How to process each feed before storing it. Inspired by Prometheus relabeling. See **Rewrite Rule Configuration** below. | `[]` | No |
| `storage.feed.flush_interval` | duration | How often to flush feed storage to the database. Higher value risks data loss but improves performance. | `200ms` | No |
| `storage.feed.embedding_llm` | string | The name of the LLM (from `llms` section) used for embedding feeds. Affects semantic search accuracy. **NOTE:** If changing, keep the old LLM config defined as past data relies on it. | default LLM in `llms` section | No |
| `storage.feed.retention` | duration | How long to keep a feed. | `8d` | No |
| `storage.feed.block_duration` | duration | How long to keep each time-based feed storage block (similar to Prometheus TSDB Block). | `25h` | No |
| Field | Type | Description | Default Value | Required |
| :---------------------------- | :---------------- | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | :---------------------------- | :------- |
| `storage.feed.rewrites` | `list of objects` | How to process each feed before storing it. Inspired by Prometheus relabeling. See **Rewrite Rule Configuration** below. | `[]` | No |
| `storage.feed.flush_interval` | `time.Duration` | Frequency to flush feed storage to the database. Higher values risk more data loss but reduce disk operations and improve performance. | `200ms` | No |
| `storage.feed.embedding_llm` | `string` | Name of the LLM used for feed embedding (from `llms` section). Significantly impacts semantic search accuracy. **Note:** If switching, keep the old LLM configuration as past data is implicitly associated with it, otherwise past data cannot be semantically searched. | Default LLM in `llms` section | No |
| `storage.feed.retention` | `time.Duration` | Retention duration for feeds. | `8d` | No |
| `storage.feed.block_duration` | `time.Duration` | Retention duration for each time-based feed storage block (similar to Prometheus TSDB Block). | `25h` | No |
### Object Storage Configuration (`storage.object`)
| Field | Type | Description | Default Value | Required |
| :--------------------------------- | :------- | :------------------------------------------- | :------------ | :----------------------------- |
| `storage.object.endpoint` | `string` | The endpoint of the object storage. | | Yes (if using podcast feature) |
| `storage.object.access_key_id` | `string` | The access key id of the object storage. | | Yes (if using podcast feature) |
| `storage.object.secret_access_key` | `string` | The secret access key of the object storage. | | Yes (if using podcast feature) |
| `storage.object.bucket` | `string` | The bucket of the object storage. | | Yes (if using podcast feature) |
| `storage.object.bucket_url`        | `string` | The public URL of the object storage bucket. | | No |
### Rewrite Rule Configuration (`storage.feed.rewrites[]`)
Defines rules to process feeds before storage. Rules are applied in order.
Defines rules to process feeds before storage. Rules are applied sequentially.
| Field | Type | Description | Default | Required |
| :--------------------------------------- | :----- | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :----------------------- | :-------------------------------------------- |
| `...rewrites[].source_label` | string | The feed label to use as the source text for transformation. Default labels: `type`, `source`, `title`, `link`, `pub_time`, `content`. | `content` | No |
| `...rewrites[].skip_too_short_threshold` | *int | If set, feeds where the `source_label` text length is below this threshold are skipped by this rule (processing continues with the next rule or feed storage if no more rules). Helps filter short/uninformative feeds. | `300` | No |
| `...rewrites[].transform` | object | Configures how to transform the `source_label` text. See **Rewrite Rule Transform Configuration** below. If unset, the `source_label` text is used directly for matching. | `nil` | No |
| `...rewrites[].match` | string | A simple string to match against the (transformed) text. Cannot be set with `match_re`. | | No (use `match` or `match_re`) |
| `...rewrites[].match_re` | string | A regular expression to match against the (transformed) text. | `.*` (matches all) | No (use `match` or `match_re`) |
| `...rewrites[].action` | string | Action to perform if matched: `create_or_update_label` (adds/updates a label with the matched/transformed text), `drop_feed` (discards the feed entirely). | `create_or_update_label` | No |
| `...rewrites[].label` | string | The feed label name to create or update. | | Yes (if `action` is `create_or_update_label`) |
| Field | Type | Description | Default Value | Required |
| :--------------------------------------- | :---------------- | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | :----------------------- | :-------------------------------------------- |
| `...rewrites[].if` | `list of strings` | Conditions to match feeds. If not set, matches all feeds. Similar to label filters, e.g., `["source=github", "title!=xxx"]`. If conditions are not met, this rule is skipped. | `[]` (matches all) | No |
| `...rewrites[].source_label` | `string` | Feed label used as the source text for transformation. Default labels include: `type`, `source`, `title`, `link`, `pub_time`, `content`. | `content` | No |
| `...rewrites[].skip_too_short_threshold` | `*int` | If set, feeds where the `source_label` text length is below this threshold will be skipped by this rule (processing continues to the next rule, or feed storage if no more rules). Helps filter out feeds that are too short/uninformative. | `300` | No |
| `...rewrites[].transform` | `object` | Configures how to transform the `source_label` text. See **Rewrite Rule Transform Configuration** below. If not set, the `source_label` text is used directly for matching. | `nil` | No |
| `...rewrites[].match` | `string` | Simple string to match against the (transformed) text. Cannot be set with `match_re`. | | No (use `match` or `match_re`) |
| `...rewrites[].match_re` | `string` | Regular expression to match against the (transformed) text. | `.*` (matches all) | No (use `match` or `match_re`) |
| `...rewrites[].action` | `string` | Action to perform on match: `create_or_update_label` (adds/updates a label with the matched/transformed text), `drop_feed` (discards the feed entirely). | `create_or_update_label` | No |
| `...rewrites[].label` | `string` | Name of the feed label to create or update. | | Yes (if `action` is `create_or_update_label`) |
| `...transform.to_text` | `object` | Transforms source text to text using an LLM. See **Rewrite Rule To Text Configuration** below. | `nil` | No |
| `...transform.to_podcast` | `object` | Transforms source text to a podcast. See **Rewrite Rule To Podcast Configuration** below. | `nil` | No |
### Rewrite Rule Transform Configuration (`storage.feed.rewrites[].transform`)
### Rewrite Rule To Text Configuration (`storage.feed.rewrites[].transform.to_text`)
| Field | Type | Description | Default | Required |
| :--------------------- | :----- | :---------------------------------------------------------------------------------------------------------- | :------ | :------- |
| `...transform.to_text` | object | Transform the source text to text using an LLM. See **Rewrite Rule Transform To Text Configuration** below. | `nil` | No |
This configuration defines how to transform the text from `source_label`.
### Rewrite Rule Transform To Text Configuration (`storage.feed.rewrites[].transform.to_text`)
| Field | Type | Description | Default Value | Required |
| :------------------ | :------- | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :---------------------------- | :-------------------------- |
| `...to_text.type` | `string` | Type of transformation. Options: <ul><li>`prompt` (default): Uses an LLM and a specified prompt to transform the source text.</li><li>`crawl`: Treats the source text as a URL, directly crawls the web page content pointed to by the URL, and converts it to Markdown format. This method performs local crawling and attempts to follow `robots.txt`.</li><li>`crawl_by_jina`: Treats the source text as a URL, crawls and processes web page content via the [Jina AI Reader API](https://jina.ai/reader/), and returns Markdown. Potentially more powerful, e.g., for handling dynamic pages, but relies on the Jina AI service.</li></ul> | `prompt` | No |
| `...to_text.llm` | `string` | **Only valid if `type` is `prompt`.** Name of the LLM used for transformation (from `llms` section). If not specified, the LLM marked as `default: true` in the `llms` section will be used. | Default LLM in `llms` section | No |
| `...to_text.prompt` | `string` | **Only valid if `type` is `prompt`.** Prompt used for transformation. The source text will be injected. You can use Go template syntax to reference built-in prompts: `{{ .summary }}`, `{{ .category }}`, `{{ .tags }}`, `{{ .score }}`, `{{ .comment_confucius }}`, `{{ .summary_html_snippet }}`, `{{ .summary_html_snippet_for_small_model }}`. | | Yes (if `type` is `prompt`) |
| Field | Type | Description | Default | Required |
| :------------------ | :----- | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | :---------------------------- | :------- |
| `...to_text.llm` | string | The name of the LLM (from `llms` section) to use for transformation. | default LLM in `llms` section | No |
| `...to_text.prompt` | string | The prompt used for transformation. The source text is injected. Go template syntax can refer to built-in prompts: `{{ .summary }}`, `{{ .category }}`, `{{ .tags }}`, `{{ .score }}`, `{{ .comment_confucius }}`, `{{ .summary_html_snippet }}`. | | Yes |
### Rewrite Rule To Podcast Configuration (`storage.feed.rewrites[].transform.to_podcast`)
### Scheduls Configuration (`scheduls`)
This configuration defines how to transform the text from `source_label` into a podcast.
| Field | Type | Description | Default Value | Required |
| :------------------------------------------- | :---------------- | :--------------------------------------------------------------------------------------------------------------------------------------------- | :---------------------------- | :------- |
| `...to_podcast.llm` | `string` | The name of the LLM (from the `llms` section) to use for generating the podcast script. | Default LLM in `llms` section | No |
| `...to_podcast.transcript_additional_prompt` | `string` | Additional instructions to append to the prompt for generating the podcast script. | | No |
| `...to_podcast.tts_llm` | `string` | The name of the LLM (from the `llms` section) to use for Text-to-Speech (TTS). **Note: Currently only supports LLMs with `provider: gemini`**. | Default LLM in `llms` section | No |
| `...to_podcast.speakers` | `list of objects` | A list of speakers for the podcast. See **Speaker Configuration** below. | `[]` | Yes |
#### Speaker Configuration (`...to_podcast.speakers[]`)
| Field | Type | Description | Default Value | Required |
| :-------------------- | :------- | :----------------------------------- | :------------ | :------- |
| `...speakers[].name` | `string` | The name of the speaker. | | Yes |
| `...speakers[].role` | `string` | The role description of the speaker. | | No |
| `...speakers[].voice` | `string` | The voice of the speaker. | | Yes |
### Scheduling Configuration (`scheduls`)
Defines rules for querying and monitoring feeds.
| Field | Type | Description | Default | Required |
| :--------------- | :-------------- | :------------------------------------------------------------------------------------------------------------------------------------------------- | :------ | :------- |
| `scheduls.rules` | list of objects | The rules for scheduling feeds. Each rule's result (matched feeds) is sent to the notify route. See **Scheduls Rule Configuration** section below. | `[]` | No |
| Field | Type | Description | Default Value | Required |
| :--------------- | :---------------- | :------------------------------------------------------------------------------------------------------------------------------------------------------------- | :------------ | :------- |
| `scheduls.rules` | `list of objects` | List of rules for scheduling feeds. The results of each rule (matched feeds) will be sent to notification routes. See **Scheduling Rule Configuration** below. | `[]` | No |
### Scheduls Rule Configuration (`scheduls.rules[]`)
### Scheduling Rule Configuration (`scheduls.rules[]`)
| Field | Type | Description | Default | Required |
| :-------------------------------- | :-------------- | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :------ | :--------------------------------------- |
| `scheduls.rules[].name` | string | The name of the rule. | | Yes |
| `scheduls.rules[].query` | string | The semantic query to find relevant feeds. Optional. | | No |
| `scheduls.rules[].threshold` | float32 | Relevance score threshold (0-1) to filter semantic query results. Only works if `query` is set. | `0.6` | No |
| `scheduls.rules[].label_filters` | list of strings | Filters based on feed labels (exact match or non-match). e.g. `["category=tech", "source!=github"]`. | `[]` | No |
| `scheduls.rules[].every_day` | string | Query range relative to the end of each day. Format: `start~end` (HH:MM). e.g., `00:00~23:59` (today), `-22:00~07:00` (yesterday 22:00 to today 07:00). Cannot be set with `watch_interval`. | | No (use `every_day` or `watch_interval`) |
| `scheduls.rules[].watch_interval` | duration | How often to run the query. e.g. `10m`. Cannot be set with `every_day`. | `10m` | No (use `every_day` or `watch_interval`) |
| Field | Type | Description | Default Value | Required |
| :-------------------------------- | :------------------ | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :------------ | :--------------------------------------- |
| `scheduls.rules[].name` | `string` | Name of the rule. | | Yes |
| `scheduls.rules[].query` | `string` | Semantic query to find relevant feeds. Optional. | | No |
| `scheduls.rules[].threshold` | `float32` | Relevance score threshold (0-1) for filtering semantic query results. Only effective if `query` is set. | `0.6` | No |
| `scheduls.rules[].label_filters` | `list of strings` | Filters based on feed labels (equals or not equals). E.g., `["category=tech", "source!=github"]`. | `[]` | No |
| `scheduls.rules[].labels` | `map[string]string` | Additional key-value labels attached to this source Feed. | `{}` | No |
| `scheduls.rules[].every_day` | `string` | Query range relative to the end of each day. Format: `start~end` (HH:MM). E.g., `00:00~23:59` (today), `-22:00~07:00` (yesterday 22:00 to today 07:00). Cannot be set with `watch_interval`. | | No (use `every_day` or `watch_interval`) |
| `scheduls.rules[].watch_interval` | `time.Duration` | Frequency to run the query. E.g., `10m`. Cannot be set with `every_day`. | `10m` | No (use `every_day` or `watch_interval`) |
### Notify Configuration (`notify`)
### Notification Configuration (`notify`)
| Field | Type | Description | Default | Required |
| :----------------- | :-------------- | :------------------------------------------------------------------------------------------------------------- | :----------- | :---------------------- |
| `notify.route` | object | The main notify routing configuration. See **Notify Route Configuration** below. | (see fields) | Yes |
| `notify.receivers` | list of objects | Defines the notification receivers (e.g., email addresses). See **Notify Receiver Configuration** below. | `[]` | Yes (at least one) |
| `notify.channels` | object | Configures the notification channels (e.g., email SMTP settings). See **Notify Channels Configuration** below. | (see fields) | Yes (if using channels) |
| Field | Type | Description | Default Value | Required |
| :----------------- | :---------------- | :-------------------------------------------------------------------------------------------------------------- | :-------------------- | :---------------------- |
| `notify.route` | `object` | Main notification routing configuration. See **Notification Routing Configuration** below. | (See specific fields) | Yes |
| `notify.receivers` | `list of objects` | Defines notification receivers (e.g., email addresses). See **Notification Receiver Configuration** below. | `[]` | Yes (at least one) |
| `notify.channels` | `object` | Configures notification channels (e.g., email SMTP settings). See **Notification Channel Configuration** below. | (See specific fields) | Yes (if using channels) |
### Notify Route Configuration (`notify.route` and `notify.route.sub_routes[]`)
### Notification Routing Configuration (`notify.route` and `notify.route.sub_routes[]`)
This structure can be nested using `sub_routes`. A feed is matched against sub-routes first; if no sub-route matches, the parent route's configuration applies.
This structure can be nested using `sub_routes`. Feeds will first try to match sub-routes; if no sub-route matches, the parent route's configuration is applied.
| Field | Type | Description | Default | Required |
| :--------------------------------- | :-------------- | :------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :------ | :------------------- |
| `...matchers` (only in sub-routes) | list of strings | Label matchers to determine if a feed belongs to this sub-route. e.g. `["category=tech", "source!=github"]`. | `[]` | Yes (for sub-routes) |
| `...receivers` | list of strings | Names of the receivers (defined in `notify.receivers`) to send notifications for feeds matching this route. | `[]` | Yes (at least one) |
| `...group_by` | list of strings | Labels to group feeds by before sending notifications. Each group results in a separate notification. e.g., `["source", "category"]`. | `[]` | Yes (at least one) |
| `...compress_by_related_threshold` | *float32 | If set, compresses highly similar feeds (based on semantic relatedness) within a group, sending only one representative. Threshold (0-1). Higher means more similar. | `0.85` | No |
| `...sub_routes` | list of objects | Nested routes. Allows defining more specific routing rules. Each object follows the **Notify Route Configuration**. | `[]` | No |
| Field | Type | Description | Default Value | Required |
| :--------------------------------- | :---------------- | :----------------------------------------------------------------------------------------------------------------------------------------------------------------- | :---------------------------- | :-------------------- |
| `...matchers` (sub-routes only) | `list of strings` | Label matchers to determine if a feed belongs to this sub-route. E.g., `["category=tech", "source!=github"]`. | `[]` | Yes (sub-routes only) |
| `...receivers` | `list of strings` | List of receiver names (defined in `notify.receivers`) to send notifications for feeds matching this route. | `[]` | Yes (at least one) |
| `...group_by` | `list of strings` | List of labels to group feeds by before sending notifications. Each group results in a separate notification. E.g., `["source", "category"]`. | `[]` | Yes (at least one) |
| `...source_label` | `string` | Source label to extract content from each feed for summarization. Defaults to all labels. Strongly recommended to set to 'summary' to reduce context length. | All labels | No |
| `...summary_prompt` | `string` | Prompt to summarize feeds for each group. | | No |
| `...llm` | `string` | Name of the LLM to use. Defaults to the default LLM in the `llms` section. Recommended to use an LLM with a large context length. | Default LLM in `llms` section | No |
| `...compress_by_related_threshold` | `*float32` | If set, compresses highly similar feeds within a group based on semantic relatedness, sending only one representative. Threshold (0-1), higher means more similar. | `0.85` | No |
| `...sub_routes` | `list of objects` | List of nested routes. Allows defining more specific routing rules. Each object follows **Notification Routing Configuration**. | `[]` | No |
### Notify Receiver Configuration (`notify.receivers[]`)
### Notification Receiver Configuration (`notify.receivers[]`)
Defines *who* receives notifications.
| Field | Type | Description | Default | Required |
| :------------------------- | :----- | :----------------------------------------------- | :------ | :------------------- |
| `notify.receivers[].name` | string | The unique name of the receiver. Used in routes. | | Yes |
| `notify.receivers[].email` | string | The email address of the receiver. | | Yes (if using email) |
| Field | Type | Description | Default Value | Required |
| :--------------------------- | :------- | :----------------------------------------------------------------------- | :------------ | :--------------------- |
| `notify.receivers[].name` | `string` | Unique name of the receiver. Used in routes. | | Yes |
| `notify.receivers[].email` | `string` | Email address of the receiver. | | Yes (if using Email) |
| `notify.receivers[].webhook` | `object` | Webhook configuration for the receiver. E.g. `webhook: { "url": "xxx" }` | | Yes (if using Webhook) |
### Notify Channels Configuration (`notify.channels`)
### Notification Channel Configuration (`notify.channels`)
Configures *how* notifications are sent.
| Field | Type | Description | Default | Required |
| :---------------------- | :----- | :--------------------------------------------------------------------------------- | :------ | :------------------- |
| `notify.channels.email` | object | The global email channel config. See **Notify Channel Email Configuration** below. | `nil` | Yes (if using email) |
| Field | Type | Description | Default Value | Required |
| :---------------------- | :------- | :------------------------------------------------------------------------------------------ | :------------ | :------------------- |
| `notify.channels.email` | `object` | Global Email channel configuration. See **Notification Channel Email Configuration** below. | `nil` | Yes (if using Email) |
### Notify Channel Email Configuration (`notify.channels.email`)
### Notification Channel Email Configuration (`notify.channels.email`)
| Field | Type | Description | Default | Required |
| :------------------------------------ | :----- | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :--------------- | :------- |
| `...email.smtp_endpoint` | string | The SMTP server endpoint. e.g. `smtp.gmail.com:587`. | | Yes |
| `...email.from` | string | The sender email address. | | Yes |
| `...email.password` | string | The application password for the sender email. (For Gmail, see [Google App Passwords](https://support.google.com/mail/answer/185833)). | | Yes |
| `...email.feed_markdown_template` | string | Markdown template for formatting each feed in the email body. Default renders the feed content. Cannot be set with `feed_html_snippet_template`. Available template variables depend on feed labels. | `{{ .content }}` | No |
| `...email.feed_html_snippet_template` | string | HTML snippet template for formatting each feed. Cannot be set with `feed_markdown_template`. Available template variables depend on feed labels. | | No |
| Field | Type | Description | Default Value | Required |
| :------------------------------------ | :------- | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | :--------------- | :------- |
| `...email.smtp_endpoint` | `string` | SMTP server endpoint. E.g., `smtp.gmail.com:587`. | | Yes |
| `...email.from` | `string` | Sender's email address. | | Yes |
| `...email.password` | `string` | App-specific password for the sender's email. (For Gmail, see [Google App Passwords](https://support.google.com/mail/answer/185833)). | | Yes |
| `...email.feed_markdown_template` | `string` | Markdown template for formatting each feed in the email body. Renders feed content by default. Cannot be set with `feed_html_snippet_template`. Available template variables depend on feed labels. | `{{ .content }}` | No |
| `...email.feed_html_snippet_template` | `string` | HTML snippet template for formatting each feed. Cannot be set with `feed_markdown_template`. Available template variables depend on feed labels. | | No |

88
docs/crawl-zh.md Normal file
View File

@@ -0,0 +1,88 @@
# 使用 Zenfeed 爬虫功能
Zenfeed 提供了将网页内容抓取并转换为 Markdown 格式的功能。这主要通过重写规则 (`rewrites` rule) 中的 `transform.to_text.type` 配置项实现。
## 如何使用
在你的配置文件中,找到 `storage.feed.rewrites` 部分。当你定义一条重写规则时,可以通过 `transform` 字段来启用爬虫功能。
具体配置如下:
```yaml
storage:
feed:
rewrites:
- if: ["source=xxx", ...]
source_label: "link" # 指定包含 URL 的标签,例如 feed 中的 'link' 标签
transform:
to_text:
type: "crawl" # 或 "crawl_by_jina"
# llm: "your-llm-name" # crawl 类型不需要 llm
# prompt: "your-prompt" # crawl 类型不需要 prompt
# match: ".*" # 可选:对抓取到的 Markdown 内容进行匹配
action: "create_or_update_label" # 对抓取到的内容执行的动作
label: "crawled_content" # 将抓取到的 Markdown 存储到这个新标签
# ... 其他配置 ...
jina: # 如果使用 crawl_by_jina并且需要更高的速率限制匿名 IP20 RPM请配置 Jina API Token
token: "YOUR_JINA_AI_TOKEN" # 从 https://jina.ai/api-dashboard/ 获取
```
### 转换类型 (`transform.to_text.type`)
你有以下几种选择:
1. **`crawl`**:
* Zenfeed 将使用内置的本地爬虫尝试抓取 `source_label` 中指定的 URL。
* 它会尝试遵循目标网站的 `robots.txt` 协议。
* 适用于静态网页或结构相对简单的网站。
2. **`crawl_by_jina`**:
* Zenfeed 将通过 [Jina AI Reader API](https://jina.ai/reader/) 来抓取和处理 `source_label` 中指定的 URL。
* Jina AI 可能能更好地处理动态内容和复杂网站结构。
* 同样遵循目标网站的 `robots.txt` 协议。
* **依赖 Jina AI 服务**
* 建议在配置文件的顶层添加 `jina.token` (如上示例) 来提供你的 Jina AI API Token以获得更高的服务速率限制。
* 如果未提供 Token将以匿名用户身份请求速率限制较低。
* 请查阅 Jina AI 的服务条款和隐私政策。
### 关键配置说明
* `source_label`: 此标签的值**必须是一个有效的 URL**。例如,如果你的 RSS Feed 中的 `link` 标签指向的是一篇包含完整文章的网页,你可以将 `source_label` 设置为 `link`
* `action`: 通常设置为 `create_or_update_label`,将抓取并转换后的 Markdown 内容存入一个新的标签中(由 `label` 字段指定)。
* `label`: 指定存储抓取到的 Markdown 内容的新标签名称。
## 使用场景
**全文内容提取**:
很多 RSS 源只提供文章摘要和原文链接。使用爬虫功能可以将原文完整内容抓取下来,转换为 Markdown 格式,方便后续的 AI 处理(如总结、打标签、分类等)或直接阅读。
## 免责声明
**在使用 Zenfeed 的爬虫功能(包括 `crawl` 和 `crawl_by_jina` 类型)前,请仔细阅读并理解以下声明。您的使用行为即表示您已接受本声明的所有条款。**
1. **用户责任与授权**:
* 您将对使用爬虫功能的所有行为承担全部责任。
* 您必须确保拥有访问、抓取和处理所提供 URL 内容的合法权利。
* 请严格遵守目标网站的 `robots.txt` 协议、服务条款 (ToS)、版权政策以及所有相关的法律法规。
* 不得使用本功能处理、存储或分发任何非法、侵权、诽谤、淫秽或其他令人反感的内容。
2. **内容准确性与完整性**:
* 网页抓取和 Markdown 转换过程的结果可能不准确、不完整或存在偏差。这可能受到目标网站结构、反爬虫机制、动态内容渲染、网络问题等多种因素的影响。
* Zenfeed 项目作者和贡献者不对抓取内容的准确性、完整性、及时性或质量作任何保证。
3. **第三方服务依赖 (`crawl_by_jina`)**:
* `crawl_by_jina` 功能依赖于 Jina AI 提供的第三方服务。
* Jina AI 服务的可用性、性能、数据处理政策、服务条款以及可能的费用(超出免费额度后)均由 Jina AI 自行决定。
* 项目作者和贡献者不对 Jina AI 服务的任何方面负责。请在使用前查阅 [Jina AI 的相关条款](https://jina.ai/terms/) 和 [隐私政策](https://jina.ai/privacy/)。
4. **无间接或后果性损害赔偿**:
* 在任何情况下,无论基于何种法律理论,项目作者和贡献者均不对因使用或无法使用爬虫功能而导致的任何直接、间接、偶然、特殊、惩戒性或后果性损害负责,包括但不限于利润损失、数据丢失、商誉损失或业务中断。
5. **法律与合规风险**:
* 未经授权抓取、复制、存储、处理或传播受版权保护的内容,或违反网站服务条款的行为,可能违反相关法律法规,并可能导致法律纠纷或处罚。
* 用户需自行承担因使用爬虫功能而产生的所有法律风险和责任。
6. **"按原样"提供**:
* 爬虫功能按"现状"和"可用"的基础提供,不附带任何形式的明示或默示担保。
**强烈建议您在启用和配置爬虫功能前,仔细评估相关风险,并确保您的使用行为完全合法合规。对于任何因用户滥用或不当使用本软件(包括爬虫功能)而引起的法律纠纷、损失或损害Zenfeed 项目作者和贡献者不承担任何责任。**

BIN
docs/images/302.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 88 KiB

BIN
docs/images/crad.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 617 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 570 KiB

After

Width:  |  Height:  |  Size: 522 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.1 MiB

After

Width:  |  Height:  |  Size: 1.1 MiB

BIN
docs/images/folo-html.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 157 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 197 KiB

After

Width:  |  Height:  |  Size: 176 KiB

BIN
docs/images/sponsor.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 897 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 131 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 688 KiB

View File

@@ -1,11 +1,11 @@
## 从 Follow 导出 OPML 文件
## Export OPML File from Follow
<img src="images/migrate-from-follow-1.png" alt="" width="300">
<img src="images/migrate-from-follow-2.png" alt="" width="500">
<img src="images/migrate-from-follow-3.png" alt="" width="500">
> 注意:一定要填写 http://rsshub:1200
> Note: Make sure to fill in http://rsshub:1200
## 导入 zenfeed-web
## Import to zenfeed-web
<img src="images/migrate-from-follow-4.png" alt="" width="500">
<img src="images/migrate-from-follow-5.png" alt="" width="500">

View File

@@ -0,0 +1,16 @@
如果无需使用 HTML 总结,模型可以随便选择
## 背景 & 原则
* Token 使用会很多,你可以想象每篇 RSS 都总结一遍会有多少消耗。所以优先选择免费模型,或者按次计费
* HTML 生成对模型有较高要求。所以你现在知道了为什么自部署的默认总结效果比不上 https://zenfeed.xyz
* 那为什么不支持 Markdown 呢web 还没精力支持,你可以先用邮件日报替代
* 总结都是后台任务,且支持有状态重试,对模型速率限制 & 稳定性没有要求
* 所以优先级是“1. 质量”、“2. 低价”、“3. 稳定”。首选 1兼顾 2无需 3
## 如果你对默认的模型效果不满意,首选推荐
* **不缺钱 or “有路子”**Gemini 2.5 Pro
* **再便宜点的**Gemini 2.5 Flash
---
v0.4.0 优化之后,免费 qwen3 的效果应该已经可以满足大部分需求

107
docs/podcast.md Normal file
View File

@@ -0,0 +1,107 @@
# 使用 Zenfeed 将文章转换为播客
Zenfeed 的播客功能可以将任何文章源自动转换为一场引人入胜的多人对话式播客。该功能利用大语言模型LLM生成对话脚本和文本转语音TTS并将最终的音频文件托管在您自己的对象存储中。
## 工作原理
1. **提取内容**: Zenfeed 首先通过重写规则提取文章的全文内容。
2. **生成脚本**: 使用一个指定的 LLM(如 GPT-4o-mini将文章内容改编成一个由多位虚拟主播对话的脚本。您可以定义每个主播的角色人设来控制对话风格。
3. **语音合成**: 调用另一个支持 TTS 的 LLM(目前仅支持 Google Gemini将脚本中的每一句对话转换为语音。
4. **音频合并**: 将所有语音片段合成为一个完整的 WAV 音频文件。
5. **上传存储**: 将生成的播客文件上传到您配置的 S3 兼容对象存储中。
6. **保存链接**: 最后,将播客文件的公开访问 URL 保存为一个新的 Feed 标签方便您在通知、API 或其他地方使用。
## 配置步骤
要启用播客功能您需要完成以下三项配置LLM、对象存储和重写规则。
### 1. 配置 LLM
您需要至少配置两个 LLM一个用于生成对话脚本另一个用于文本转语音TTS
- **脚本生成 LLM**: 可以是任何性能较好的聊天模型,例如 OpenAI 的 `gpt-4o-mini` 或 Google 的 `gemini-1.5-pro`
- **TTS LLM**: 用于将文本转换为语音。**注意:目前此功能仅支持 `provider``gemini` 的 LLM。**
**示例 `config.yaml`:**
```yaml
llms:
# 用于生成播客脚本的 LLM
- name: openai-chat
provider: openai
api_key: "sk-..."
model: gpt-4o-mini
default: true
# 用于文本转语音 (TTS) 的 LLM
- name: gemini-tts
provider: gemini
api_key: "..." # 你的 Google AI Studio API Key
tts_model: "gemini-2.5-flash-preview-tts" # Gemini 的 TTS 模型
```
### 2. 配置对象存储
生成的播客音频文件需要一个地方存放。Zenfeed 支持任何 S3 兼容的对象存储服务。这里我们以 [Cloudflare R2](https://www.cloudflare.com/zh-cn/products/r2/) 为例。
首先,您需要在 Cloudflare R2 中创建一个存储桶Bucket然后获取以下信息
- `endpoint`: 您的 R2 API 端点。通常格式为 `<account_id>.r2.cloudflarestorage.com`。您可以在 R2 存储桶的主页找到它。
- `access_key_id``secret_access_key`: R2 API 令牌。您可以在 "R2" -> "管理 R2 API 令牌" 页面创建。
- `bucket`: 您创建的存储桶的名称。
- `bucket_url`: 存储桶的公开访问 URL。要获取此 URL您需要将存储桶连接到一个自定义域或者使用 R2 提供的 `r2.dev` 公开访问地址。
**示例 `config.yaml`:**
```yaml
storage:
object:
endpoint: "<your_account_id>.r2.cloudflarestorage.com"
access_key_id: "..."
secret_access_key: "..."
bucket: "zenfeed-podcasts"
bucket_url: "https://pub-xxxxxxxx.r2.dev"
```
### 3. 配置重写规则
最后一步是创建一个重写规则,告诉 Zenfeed 如何将文章转换为播客。这个规则定义了使用哪个标签作为源文本、由谁来对话、使用什么声音等。
**关键配置项:**
- `source_label`: 包含文章全文的标签。
- `label`: 用于存储最终播客 URL 的新标签名称。
- `transform.to_podcast`: 播客转换的核心配置。
- `llm`: 用于生成脚本的 LLM 名称(来自 `llms` 配置)。
- `tts_llm`: 用于 TTS 的 LLM 名称(来自 `llms` 配置)。
- `speakers`: 定义播客的演讲者。
- `name`: 演讲者的名字。
- `role`: 演讲者的角色和人设,将影响脚本内容。
- `voice`: 演讲者的声音。请参考 [Gemini TTS 文档](https://ai.google.dev/gemini-api/docs/speech-generation#voices)。
**示例 `config.yaml`:**
```yaml
storage:
feed:
rewrites:
- source_label: content # 基于原文
transform:
to_podcast:
estimate_maximum_duration: 3m0s # 接近 3 分钟
transcript_additional_prompt: 对话引人入胜,流畅自然,拒绝 AI 味,使用中文回复 # 脚本内容要求
llm: xxxx # 负责生成脚本的 llm
tts_llm: gemini-tts # 仅支持 gemini tts推荐使用 https://github.com/glidea/one-balance 轮询
speakers:
- name: 小雅
role: >-
一位经验丰富、声音甜美、风格活泼的科技播客主持人。前财经记者、媒体人出身,因为工作原因长期关注科技行业,后来凭着热爱和出色的口才转行做了全职内容创作者。擅长从普通用户视角出发,把复杂的技术概念讲得生动有趣,是她发掘了老王,并把他‘骗’来一起做播客的‘始作俑者’。
voice: Autonoe
- name: 老王
role: >-
一位资深科技评论员,互联网老兵。亲身经历过中国互联网从草莽到巨头的全过程,当过程序员,做过产品经理,也创过业。因此他对行业的各种‘风口’和‘概念’有自己独到的、甚至有些刻薄的见解。观点犀利,一针见血,说话直接,热衷于给身边的一切产品挑刺。被‘忽悠’上了‘贼船’,表面上经常吐槽,但内心很享受这种分享观点的感觉。
voice: Puck
label: podcast_url
```
配置完成后Zenfeed 将在每次抓取到新文章时自动执行上述流程。可以在通知模板中使用 podcast_url label或在 Web 中直接收听Web 固定读取 podcast_url label若使用别的名称则无法读取)。

179
docs/query-api-zh.md Normal file
View File

@@ -0,0 +1,179 @@
# Zenfeed Query API 使用教程
Zenfeed Query API 允许用户通过多种条件检索存储的 Feed 数据。本教程将详细介绍如何使用此 API。
## 接口说明
### 请求
* **方法**: `POST`
* **URL**: `/query`
* **Content-Type**: `application/json`
### 请求体 (JSON)
```json
{
"query": "string",
"threshold": 0.55,
"label_filters": ["string"],
"summarize": false,
"limit": 10,
"start": "2006-01-02T15:04:05Z07:00",
"end": "2006-01-02T15:04:05Z07:00"
}
```
**字段说明:**
* `query` (string, 可选):
* 用于语义搜索的查询字符串。
* 如果提供,必须至少包含 5 个字符。
* 如果为空或未提供,则不进行语义搜索,仅根据其他条件(如标签、时间)过滤。
* `threshold` (float32, 可选, 默认值: `0.55`):
* 语义搜索的相关性阈值。
* 取值范围: `[0, 1]`
* 仅当 `query` 字段非空时有效。
* `label_filters` ([]string, 可选):
* 一个字符串数组,用于根据 Feed 的标签进行过滤。
* 每个过滤器的格式为:
* `"key=value"`: 匹配标签 `key` 的值为 `value` 的 Feed。
* `"key!=value"`: 匹配标签 `key` 的值不为 `value` 的 Feed。
* 常用的 `key` 包括:
* `source`: Feed 来源
* `title`: Feed 标题
* `你在 rewrite 阶段自定义创建的`:比如 category
* 可以指定多个过滤器,它们之间是 "AND" 关系。
* `summarize` (bool, 可选, 默认值: `false`):
* 是否对查询结果进行摘要。
* 如果为 `true`,系统将调用配置的 LLM (Large Language Model) 对返回的 Feed 内容进行总结。
* `limit` (int, 可选, 默认值: `10`):
* 返回 Feed 结果的最大数量。
* 取值范围: `[1, 500]`
* `start` (string, 可选, 默认值: 24小时前):
* 查询的时间范围的开始时间(包含)。
* 格式为 RFC3339 (例如: `"2023-10-26T10:00:00Z"`)。
* `end` (string, 可选, 默认值: 当前时间):
* 查询的时间范围的结束时间(不包含)。
* 格式为 RFC3339 (例如: `"2023-10-27T10:00:00Z"`)。
* `end` 时间必须晚于 `start` 时间。
### 响应体 (JSON)
```json
{
"summary": "string",
"feeds": [
{
"labels": {
"type": "rss",
"source": "Example News",
"title": "Breaking News: AI Revolutionizes Everything",
"link": "http://example.com/news/123",
"pub_time": "2023-10-26T09:30:00Z",
"content": "Detailed content of the news article..."
},
"time": "2023-10-26T10:15:30+08:00",
"score": 0.85
}
],
"count": 1
}
```
**字段说明:**
* `summary` (string, 可选):
* 如果请求中的 `summarize``true` 且成功生成摘要,此字段将包含 LLM 生成的内容摘要。
* 如果生成摘要失败,可能包含错误信息。
* `feeds` ([]object, 必须):
* 一个对象数组,每个对象代表一个符合查询条件的 Feed。
* **Feed 对象结构**:
* `labels` (object): Feed 的元数据标签,键值对形式。
* `type` (string): Feed 类型。
* `source` (string): Feed 来源。
* `title` (string): Feed 标题。
* `link` (string): Feed 原始链接。
* `pub_time` (string): Feed 发布时间。
* `content` (string): Feed 内容。
* ... (其他自定义标签)
* `time` (string): Feed 被系统记录或处理的时间戳 (RFC3339 格式,通常为服务器本地时区)。
* `score` (float32, 可选):
* 当请求中提供了 `query` (进行了语义搜索) 时,此字段表示该 Feed 与查询的相关性得分。
* 得分越高,相关性越强。
* `count` (int, 必须):
* 返回的 `feeds` 数组中的 Feed 数量。
## `curl` 示例
以下示例假设 Zenfeed 服务运行在 `http://localhost:1300`
### 1. 基本查询 (获取最近10条记录)
获取最近默认24小时内的最多10条 Feed。
```bash
curl -X POST http://localhost:1300/query \
-H "Content-Type: application/json" \
-d '{}'
```
### 2. 语义搜索
查询与 "人工智能最新进展" 相关的 Feed并设置相关性阈值为 `0.7`
```bash
curl -X POST http://localhost:1300/query \
-H "Content-Type: application/json" \
-d '{
"query": "人工智能最新进展",
"threshold": 0.7
}'
```
### 3. 带标签过滤的查询
查询类型为 "rss" 且来源不是 "SpecificSource" 的 Feed。
```bash
curl -X POST http://localhost:1300/query \
-H "Content-Type: application/json" \
-d '{
"label_filters": [
"type=rss",
"source!=SpecificSource"
]
}'
```
### 4. 带时间范围的查询
查询 2023年10月25日 00:00:00 UTC 到 2023年10月26日 00:00:00 UTC 之间的 Feed。
```bash
curl -X POST http://localhost:1300/query \
-H "Content-Type: application/json" \
-d '{
"start": "2023-10-25T00:00:00Z",
"end": "2023-10-26T00:00:00Z"
}'
```
### 5. 组合查询示例
查询过去3天内与 "开源项目" 相关的 Feed类型为 "github_release"并获取摘要最多返回20条。
```bash
# 假设今天是 2023-10-28
curl -X POST http://localhost:1300/query \
-H "Content-Type: application/json" \
-d '{
"query": "最近的热门开源项目", # 尽可能详细,获得最佳搜索效果
"threshold": 0.6,
"label_filters": ["source=github_trending"],
"summarize": true,
"limit": 20,
"start": "2023-10-25T00:00:00Z", # 手动计算或动态生成
"end": "2023-10-28T00:00:00Z" # 手动计算或动态生成
}'
```

33
docs/roadmap-zh.md Normal file
View File

@@ -0,0 +1,33 @@
## 短期
* 播客
* NotebookLM 的播客效果让人惊艳
* 技术上复刻一个并不难,难的是没有又便宜、效果又好的 TTS API只用得起小帅的声音😭
* TTS 音色进步也只是近几年的事情,长期需要等成本下降
* 短期因为我个人很喜欢播客总结(应该也很适合大家通勤),会先本地部署模型,提供给 https://zenfeed.xyz 使用
* epub2rss
* 见过 rss2epub但你绝没见过反着来的
* 严格上这并不属于 zenfeed顶多算生态项目吧
* 抛开时效性,书比新闻更有价值。但当你立下 “坚持阅读” 的 flag然后呢
* 这个子项目旨在实现:每日更新一章,作为 rss 暴露。在阅读新闻 RSS 时,“顺便” 把书给看了
* 这里遵循《掌控习惯》的几个原理
* 让它显而易见:在你的新闻阅读器里
* 让它简便易行:配合 zenfeed 总结,更轻松地阅读要点(进一步了解原文逃不掉,但这时你已经被勾住了,相信这事已经没那么困难了)
* 让你感觉到爽zenfeed 阅读完后的木鱼声,嗯这算一个,确信
* 提供更多玩法指导
* zenfeed 定位是信息管理引擎,普通用户反而搞不清楚状况
* 短期并不会考虑做一个没有使用心智成本的 “产品”,但我可以分享一些垂直的使用案例技巧
> 灵光一现:最近喜欢上和豆包聊新闻了,或许可以分享下如何把 zenfeed 数据接入豆包
## 中长期
* 更易用的 Web但坦诚地讲目前优先级比较低更鼓励调用后端 api构建一个属于你的 web
* 主题研究报告
* 屏蔽 or follow 相关新闻后续
* 相关性聚合阅读
![](images/web-reading-aggr.png)
> P.S. 相关功能已经实现,只不过没有下放到 Web
---
如果你觉得 zenfeed 很酷,并且有意愿贡献,请联系我!

59
docs/rss-api-zh.md Normal file
View File

@@ -0,0 +1,59 @@
# 托管源
## Folo
直接搜索 zenfeed
## Other
```bash
https://zenfeed.xyz/rss?.... 参数用法见下方《自部署》
https://zenfeed.xyz/rss?label_filter=source=知乎热榜 # 你在 zenfeed.xyz 中看到的源名称
https://zenfeed.xyz/rss?query=AI # 语义搜索。请不要滥用,成本 cover 不住可能随时下线
```
# 自部署
## 1. 配置(可选)
```yaml
api:
rss:
content_html_template: | # 可自由排版搭配go template 语法);需要确保渲染后的内容是正确的 HTML
{{ .summary_html_snippet }} # 默认值
```
## 2. enjoy RSS address!
```bash
your_zenfeed_address/rss?label_filter=label1=value1&label_filter=label2!=value2&query=xxx
# e.g.
## Past 24h rss feed for GithubTrending
http://localhost:1302/rss?label_filter=source=GithubTrending
## Past 24h rss feed for Tech category
http://localhost:1302/rss?label_filter=category=Tech
## Past 24h rss feed for dynamic query
http://localhost:1302/rss?query=特朗普最新消息
```
# FAQ
## 添加失败怎么办?
部分 RSS 阅读器通过服务端间接访问 RSS 地址,如果 zenfeed 部署到本地,将无法访问
你需要通过内网穿透,或者 VPS 暴露到公网上,注意仅暴露 1302 端口
## Folo 看起来只有纯文本?
![](images/folo-html.png)
## 暗黑模式显示有问题?
嗯就是有问题,请使用白底背景,否则样式渲染会出现问题

155
docs/tech/hld-zh.md Normal file
View File

@@ -0,0 +1,155 @@
> 适用版本v0.2.2
```mermaid
graph TD
subgraph User_Interactions
WebUI["Web UI (zenfeed-web)"]
MCPClient["MCP Client"]
end
subgraph Zenfeed_Core_Services
HTTPServer["HTTP Server (pkg/api/http)"]
MCPServer["MCP Server (pkg/api/mcp)"]
API["API Service (pkg/api)"]
end
subgraph Data_Processing_Storage_Main
ScraperManager["Scraper Manager (pkg/scrape)"]
Rewriter["Rewriter (pkg/rewrite)"]
FeedStorage["Feed Storage (pkg/storage/feed)"]
LLMFactory["LLM Factory (pkg/llm)"]
KVStorage["KV Storage (pkg/storage/kv)"]
end
subgraph FeedStorage_Internals
Block["Block (pkg/storage/feed/block)"]
ChunkFile["ChunkFile (pkg/storage/feed/block/chunk)"]
PrimaryIndex["Primary Index (pkg/storage/feed/block/index/primary)"]
InvertedIndex["Inverted Index (pkg/storage/feed/block/index/inverted)"]
VectorIndex["Vector Index (pkg/storage/feed/block/index/vector)"]
end
subgraph Scheduling_Notification
Scheduler["Scheduler (pkg/schedule)"]
Notifier["Notifier (pkg/notify)"]
NotifyChan["(Go Channel for Results)"]
EmailChannel["Email Channel (pkg/notify/channel)"]
end
ConfigManager["Config Manager (pkg/config)"]
ExternalDataSources["External Data Sources (RSS Feeds, RSSHub)"]
LLMProviders["LLM Providers (OpenAI, Gemini, etc.)"]
EmailServiceProvider["Email Service Provider (SMTP)"]
WebUI --> HTTPServer
MCPClient --> MCPServer
HTTPServer --> API
MCPServer --> API
API --> ConfigManager
API --> FeedStorage
API --> LLMFactory
ScraperManager --> ExternalDataSources
ScraperManager --> KVStorage
ScraperManager --> FeedStorage
FeedStorage --> Rewriter
FeedStorage --> LLMFactory
FeedStorage --> KVStorage
FeedStorage --> Block
Block --> ChunkFile
Block --> PrimaryIndex
Block --> InvertedIndex
Block --> VectorIndex
Rewriter --> LLMFactory
Scheduler --> FeedStorage
Scheduler --> NotifyChan
Notifier --> NotifyChan
Notifier --> LLMFactory
Notifier --> EmailChannel
Notifier --> KVStorage
EmailChannel --> EmailServiceProvider
ConfigManager --> HTTPServer
ConfigManager --> MCPServer
ConfigManager --> API
ConfigManager --> ScraperManager
ConfigManager --> Rewriter
ConfigManager --> FeedStorage
ConfigManager --> LLMFactory
ConfigManager --> Scheduler
ConfigManager --> Notifier
LLMFactory --> LLMProviders
LLMFactory --> KVStorage
```
## 技术特点
* 零外部依赖
* Golang 资源占用少于采用 Python 的竞品
* 采用模块化、面向服务的架构,各组件职责清晰
* 系统配置集中管理,并支持热重载,实现动态调整
* 提供灵活的内容重写管道,可自定义处理流程
* Feed 数据按时间分块存储,支持高效索引与生命周期管理
* 支持基于向量嵌入的语义搜索能力
* 通过可配置的抓取器和 RSSHub 集成,支持多样化的数据源
* 基于规则的调度引擎,实现灵活的事件监控与查询
* 可定制的通知路由和多渠道通知发送机制
* 实现 MCP (Model Context Protocol) 服务端,便于外部工具集成
* 提供统一的 API 接口层,解耦核心业务与通信协议
* 内置通用键值存储,用于缓存和持久化辅助状态
## 组件说明
1. **配置管理器 (ConfigManager - `pkg/config.Manager`)**
* 负责加载、管理和热更新应用的整体配置 (通常存储在 `config.yaml` 中)。其他组件订阅配置变更,以便动态调整其行为。是系统动态性的基础。
2. **键值存储 (KVStorage - `pkg/storage/kv.Storage`)**
* 提供一个通用的键值存储服务。用于存储临时状态、缓存(如 LLM 调用、RSSHub 响应)、小型元数据、以及一些组件的运行状态(如 Scraper 的最后抓取时间、Notifier 的通知发送记录)。
3. **大语言模型工厂 (LLMFactory - `pkg/llm.Factory`)**
* 管理和提供大语言模型 (LLM) 的实例。它根据配置初始化不同的 LLM 客户端 (如 OpenAI, Gemini, SiliconFlow 等),并向上层组件 (如 `Rewriter`, `FeedStorage`, `Notifier`) 提供统一的 LLM 调用接口。这些接口用于文本生成、内容摘要、向量嵌入等 AI 处理任务,并可以动态切换或更新 LLM 配置。
4. **内容重写器 (Rewriter - `pkg/rewrite.Rewriter`)**
* 根据用户在配置文件中定义的重写规则 (Rewrite Rules),对原始 Feed 内容进行管道式处理。每个规则可以针对 Feed 的特定标签 (如标题、正文),通过调用 `LLMFactory` 提供的模型执行操作 (如评分、分类、摘要、过滤、添加新标签等)。处理后的 Feed 用于存储或进一步的逻辑判断。
5. **Feed 存储 (FeedStorage - `pkg/storage/feed.Storage`)**
* 负责持久化存储经过 `Rewriter` 处理后的 Feed 数据,并提供高效的查询接口。它管理着 Feed 数据的生命周期和存储结构。
* **关键子组件**:
* **Block (`pkg/storage/feed/block.Block`)**: `FeedStorage` 将数据按时间组织成多个 `Block`。每个 `Block` 代表一个时间段内的数据 (例如,过去 25 小时)。这种设计有助于数据的管理,如按时间归档、删除过期数据,并能独立处理冷热数据。
* **ChunkFile (`pkg/storage/feed/block/chunk.File`)**: 在每个 `Block` 内部,实际的 Feed 内容(经过序列化,包含所有标签和时间戳)存储在 `ChunkFile` 中。这是一种紧凑的存储方式,支持高效的追加和按偏移读取。
* **Primary Index (`pkg/storage/feed/block/index/primary.Index`)**: 为每个 `Block` 内的 Feed 提供主键索引。它将全局唯一的 Feed ID 映射到该 Feed 在对应 `ChunkFile` 中的具体位置(如偏移量),实现通过 ID 快速定位 Feed 数据。
* **Inverted Index (`pkg/storage/feed/block/index/inverted.Index`)**: 为每个 `Block` 内的 Feed 标签建立倒排索引。它将标签的键值对映射到包含这些标签的 Feed ID 列表,从而能够根据标签条件快速过滤 Feed。
* **Vector Index (`pkg/storage/feed/block/index/vector.Index`)**: 为每个 `Block` 内的 Feed或其内容切片存储由 `LLMFactory` 生成的向量嵌入。它支持高效的近似最近邻搜索,从而实现基于语义相似度的 Feed 查询。
6. **API 服务 (API - `pkg/api.API`)**
* 提供核心的业务逻辑接口层,供上层服务 (如 `HTTPServer`, `MCPServer`) 调用解耦核心业务逻辑与具体的通信协议。接口功能包括应用配置的查询与动态应用、RSSHub 相关信息的查询、Feed 数据的写入与多维度查询等。此组件会响应配置变更,并将其传递给其依赖的下游组件。
7. **HTTP 服务 (HTTPServer - `pkg/api/http.Server`)**
* 暴露一个 HTTP/JSON API 接口,主要供 Web 前端 (`zenfeed-web`) 或其他HTTP客户端使用。用户通过此接口进行如添加订阅源、配置监控规则、查看 Feed 列表、管理应用配置等操作。它依赖 `API` 组件来执行实际的业务逻辑。
8. **MCP 服务 (MCPServer - `pkg/api/mcp.Server`)**
* 实现 Model Context Protocol (MCP) 服务端。这使得 Zenfeed 的数据可以作为上下文源被外部应用或 LLM 集成。
9. **抓取管理器 (ScraperManager - `pkg/scrape.Manager`)**
* 负责管理和执行从各种外部数据源 (主要是 RSS Feed支持通过 RSSHub 扩展源) 抓取内容的任务。它根据配置中定义的来源和抓取间隔,定期或按需从指定的 URL 或 RSSHub 路由抓取最新的 Feed 数据。抓取到的原始数据会提交给 `FeedStorage` 进行后续的重写处理和存储。
* **关键子组件**:
* **Scraper (`pkg/scrape/scraper.Scraper`)**: 每个配置的数据源会对应一个 `Scraper` 实例,负责该特定源的抓取逻辑和调度。
* **Reader (`pkg/scrape/scraper/source.go#reader`)**: `Scraper` 内部使用不同类型的 `reader` (如针对标准 RSS URL 的 reader针对 RSSHub 路径的 reader) 来实际获取数据。
10. **调度器 (Scheduler - `pkg/schedule.Scheduler`)**
* 根据用户配置的调度规则 (Scheduls Rules) 定期执行查询任务。这些规则定义了特定的查询条件,如语义关键词 (基于向量搜索)、标签过滤、以及时间范围等。当 `FeedStorage` 中有符合规则条件的 Feed 数据时,调度器会将这些结果 (封装为 `rule.Result`) 通过一个内部 Go Channel (`notifyChan`) 发送给 `Notifier` 组件进行后续处理。
* **关键子组件**:
* **Rule (`pkg/schedule/rule.Rule`)**: 每个调度配置对应一个 `Rule` 实例,封装了该规则的查询逻辑和执行计划。
11. **通知器 (Notifier - `pkg/notify.Notifier`)**
* 监听来自 `Scheduler``notifyChan`。接收到 `rule.Result` 后,它会根据通知路由 (NotifyRoute) 配置对 Feed 进行分组、聚合。为了生成更精炼的通知内容,它可能会再次调用 `LLMFactory` 进行摘要。最终,通过配置的通知渠道 (NotifyChannels) 将处理后的信息发送给指定的接收者 (NotifyReceivers)。其发送状态或去重逻辑可能利用 `KVStorage`
* **关键子组件**:
* **Router (`pkg/notify/route.Router`)**: 根据配置的路由规则,将 `rule.Result` 中的 Feed 分配到不同的处理流程或目标接收者。
* **Channel (`pkg/notify/channel.Channel`)**: 代表具体的通知发送方式,例如 `EmailChannel` 负责通过 SMTP 发送邮件。

109
docs/tech/rewrite-zh.md Normal file
View File

@@ -0,0 +1,109 @@
> 适用版本v0.2.2
`rewrite` 组件是 zenfeed 中负责对信息流内容进行动态处理和转换的核心模块。它允许用户通过声明式的规则配置,利用大型语言模型 (LLM) 等工具,对内容的元数据(标签)进行修改、丰富、过滤,甚至决定是否丢弃某条信息。
## 1. 设计理念与哲学
* **Prometheus 的 `relabel_config`**: 借鉴其强大的标签重写能力。在 Prometheus 中,`relabel_config` 允许用户在采集指标前后动态地修改标签集,实现服务发现、指标过滤和路由等高级功能。`rewrite` 组件将此思想应用于信息流处理,将每一条信息(如一篇文章、一个帖子)视为一个标签集,通过规则来操作这些标签。
* **管道 (Pipeline) 处理模式**: 信息的处理过程被设计成一个可配置的 ETL 管道。每个规则是管道中的一个处理阶段,信息流经这些规则,逐步被转换和打标。这种模式使得复杂的处理逻辑可以被分解为一系列简单、独立的步骤,易于理解和维护。
* **AI 能力的模块化与按需应用**: 大型语言模型 (LLM) 被视为一种强大的"转换函数"。用户可以根据需求,在规则中指定使用哪个 LLM、配合什么样的提示词 (Prompt) 来处理特定的文本内容(例如,从文章正文生成摘要、分类、评分等)。这种设计使得 AI 能力可以灵活地嵌入到信息处理的任意环节。
* **内容即标签 (Content as Labels)**: 这是 zenfeed 的一个核心抽象。原始信息(如标题、正文、链接、发布时间)和经过 AI 或规则处理后产生的衍生信息(如类别、标签、评分、摘要)都被统一表示为键值对形式的"标签"。这种统一表示简化了后续的查询、过滤、路由和展示逻辑。
* **声明式配置优于命令式代码**: 用户通过 YAML 配置文件定义重写规则,而不是编写代码来实现处理逻辑。这降低了使用门槛,使得非程序员也能方便地定制自己的信息处理流程,同时也使得配置更易于管理和版本控制。
> 简单说这是一条专门针对 Feed 处理的可配置工作流
## 2. 业务流程
内容重写组件的核心工作流程是接收一个代表信息单元的标签集 (`model.Labels`),然后按顺序应用预定义的重写规则 (`Rule`),最终输出一个经过修改的标签集,或者指示该信息单元应被丢弃。
其处理流程可以概括为:
1. **接收标签集**: 组件的入口是一个 `model.Labels` 对象,代表待处理的信息单元。
2. **顺序应用规则**: 系统会遍历用户配置的每一条 `Rule`
3. **规则评估与执行**: 对于每一条规则,系统会:
* **定位源文本**: 根据规则指定的 `source_label` (默认为 `content`),找到相应的文本内容。
* **条件检查**: 检查源文本是否满足规则中声明的 `skip_too_short_threshold`(最小长度,默认为 300 字符)。若不满足则跳过当前规则。
* **文本转换 (可选)**: 若规则声明了 `transform` (如通过 `to_text` 使用 LLM 和特定 `Prompt` 进行处理),则源文本会被转换为新文本。此转换结果将用于后续的匹配。
* **模式匹配**: 使用规则中声明的 `match` 正则表达式 (默认为 `.*`) 来匹配(可能已被转换过的)文本。若不匹配,则跳过当前规则。
* **执行动作**: 若文本匹配成功,则执行规则声明的 `Action`
* `ActionDropFeed`: 指示应丢弃当前信息单元,处理流程终止。
* `ActionCreateOrUpdateLabel`: 使用(可能已被转换过的)匹配文本,为规则中指定的 `Label` 创建或更新标签值。
4. **输出结果**:
* 若所有规则处理完毕且未触发 `ActionDropFeed`,则返回最终修改并排序后的 `model.Labels`
* 若任一规则触发 `ActionDropFeed`,则返回 `nil`,表示丢弃。
* 处理过程中若发生错误(如 LLM 调用失败),则会中止并返回错误。
## 3. 使用示例
以下是一些如何使用 `rewrite` 规则的场景示例:
### 示例 1: 内容分类打标
* **目标**: 根据文章内容,自动为其添加一个 `category` 标签,如 "Technology", "Finance" 等。
* **规则配置 (概念性)**:
```yaml
- source_label: "content" # 使用文章正文作为分析源
transform:
to_text:
llm: "qwen-default" # 使用名为 "qwen-default" 的 LLM 配置
prompt: "{{ .category }} 可以接着补充你额外的要求" # 使用预设的 "category" prompt 模板
match: ".+" # 匹配 LLM 返回的任何非空分类结果
action: "create_or_update_label"
label: "category" # 新标签的键为 "category"
```
* **效果**: 如果一篇文章内容是关于人工智能的LLM 可能会返回 "Technology"。经过此规则处理后,文章的标签集会增加或更新一个标签,例如 `{"category", "Technology"}`。**后续可用于,“查询分类为 Technology 的文章”,“基于分类为 Technology 的文章发送每日科技日报”...**
### 示例 2: 基于 LLM 评分过滤低质量内容
* **目标**: 让 LLM 对文章内容进行评分 (0-10),如果评分低于 4则丢弃该文章。
* **规则配置 (包含两条规则)**:
* **规则 2.1: 内容评分**
```yaml
- source_label: "content"
transform:
to_text:
llm: "qwen-default"
prompt: "{{ .score }} 可以接着补充你额外的要求" # 使用预设的 "score" prompt 模板
match: "^([0-9]|10)$" # 确保 LLM 返回的是 0-10 的数字
action: "create_or_update_label"
label: "ai_score" # 将评分结果存入 "ai_score" 标签
```
* **规则 2.2: 根据评分过滤**
```yaml
- source_label: "ai_score" # 使用上一条规则生成的评分作为判断依据
# 无需 Transform
match: "^[0-3]$" # 匹配 0, 1, 2, 3 分
action: "drop_feed" # 丢弃这些低分文章
```
* **效果**: 文章首先会被 LLM 评分并打上 `ai_score` 标签。如果该评分值在 0 到 3 之间,第二条规则会将其丢弃。
### 示例 3: 基于特定标签值添加新标签
* **目标**: 如果文章的 `source` 标签值是 "Hacker News",则添加一个新标签 `source_type: "community"`。
* **注意**: 当前 `ActionCreateOrUpdateLabel` 会将匹配成功的 `text` (即 `source_label` 的值或其转换结果)作为新标签的值。若要实现固定值标签,需要通过 LLM 转换。
* **规则配置 (通过 LLM 实现映射)**:
```yaml
- source_label: "source" # 源标签是 "source"
transform:
to_text:
llm: "qwen-mini"
# Prompt 需要精心设计,告诉 LLM 如何根据输入映射到输出
# 例如Prompt 可以包含类似 "If input is 'Hacker News', output 'community'. If input is 'GitHub Trending', output 'code'." 的逻辑
prompt: |
Analyze the input, which is a news source name.
If the source is "Hacker News", output "community".
If the source is "GitHub Trending", output "code".
If the source is "V2EX", output "community".
Otherwise, output "unknown".
Return ONLY the type, no other text.
match: "^(community|code|unknown)$" # 确保 LLM 输出的是预期的类型
action: "create_or_update_label"
label: "source_type" # 新标签的键
```
* **效果**: 如果某文章的 `source` 标签值为 "Hacker News",经过 LLM 处理后(理想情况下)会输出 "community",然后 `source_type` 标签会被设置为 `{"source_type", "community"}`。
这些示例展示了 `rewrite` 组件的灵活性和强大功能,通过组合不同的源标签、转换、匹配条件和动作,可以实现复杂的内容处理和信息增强逻辑。

160
docs/tech/testing-zh.md Normal file
View File

@@ -0,0 +1,160 @@
# Zenfeed 最新测试策略与风格
> 适用版本v0.2.2
## 1. 引言
Zenfeed 的测试策略核心目标是:
* **清晰性 (Clarity)**:测试本身应如文档般易于理解,清晰地表达被测功能的行为和预期。
* **可信性 (Reliability)**:测试结果应准确反映代码的健康状况,确保每次提交的信心。
* **可维护性 (Maintainability)**:测试代码应易于修改和扩展,以适应项目的持续演进。
本指南旨在详细介绍 Zenfeed 项目所遵循的测试理念、风格和具体实践。
## 2. 核心测试理念与风格
Zenfeed 的测试方法论深受行为驱动开发 (BDD) 的影响,并结合了表驱动测试等高效实践。
### 2.1 行为驱动开发
我们选择 BDD 作为核心的测试描述框架,主要基于以下原因(其理念也体现在 `pkg/test/test.go` 的 `Case` 结构设计中):
* **提升可读性 (Enhanced Readability)**BDD 强调使用自然语言描述软件的行为。每个测试用例读起来都像一个用户故事或一个功能说明,这使得测试本身就成为了一种精确的"活文档"。
* **关注行为 (Focus on Behavior)**:测试不再仅仅是验证代码片段的输入输出,而是从模块、组件或用户交互的层面描述其应有的行为。这有助于确保我们构建的功能符合预期。
* **需求驱动 (Requirement-Driven)**:测试直接对应需求描述,而非实现细节。这种自顶向下的方法确保了测试的稳定性,即使内部实现重构,只要行为不变,测试依然有效。
BDD 通常使用 `Scenario`, `Given`, `When`, `Then` 的结构来组织测试:
* **`Scenario` (场景)**:描述测试用例所针对的特性或功能点。
* 例如:`"Query hot block with label filters"` (查询带标签过滤的热数据块)
* **`Given` (给定)**:描述场景开始前的初始上下文或状态(**注意:这不是指方法的输入参数**)。
* 例如:`"a hot block with indexed feeds"` (一个已索引了 Feed 的热数据块)
* **`When` (当)**:描述触发场景的事件或操作(**这部分通常包含被测方法的输入参数**)。
* 例如:`"querying with label filters"` (当使用标签过滤器进行查询时)
* **`Then` (那么)**:描述场景结束后预期的结果或状态变化。
* 例如:`"should return matching feeds"` (那么应该返回匹配的 Feed)
为了更好地在代码中实践 BDD我们定义了 `pkg/test/test.go` 中的 `Case[GivenDetail, WhenDetail, ThenExpected]` 泛型结构。其中:
* `GivenDetail`: 存储 `Given` 子句描述的初始状态的具体数据。
* `WhenDetail`: 存储 `When` 子句描述的事件或方法调用的具体参数。
* `ThenExpected`: 存储 `Then` 子句描述的预期结果。
这种结构化不仅增强了测试数据的类型安全,也使得测试用例的意图更加明确。对于需要模拟依赖项的组件,`GivenDetail` 通常会包含用于配置这些模拟行为的 `component.MockOption`,我们将在后续 Mocking 章节详细讨论。
### 2.2 表驱动测试
当一个功能或方法需要针对多种不同的输入组合、边界条件或状态进行测试时,表驱动测试是一种非常高效和整洁的组织方式。
* **简洁性 (Conciseness)**:将所有测试用例的数据(输入、参数、预期输出)集中定义在一个表格(通常是切片)中,避免了为每个 case编写大量重复的测试逻辑。
* **易扩展性 (Extensibility)**:添加新的测试场景变得非常简单,只需在表格中增加一条新记录即可。
* **清晰性 (Clarity)**:所有相关的测试用例一目了然,便于快速理解被测功能的覆盖范围。
**实践约定**
在 Zenfeed 中,**当存在多个测试用例时,必须使用表驱动测试**。
### 2.3 测试结构约定
为了保持项目范围内测试代码的一致性和可读性,我们约定在测试文件中遵循以下组织结构:
1. **定义辅助类型 (Define Helper Types)**:在测试函数的开头部分,通常会为 `GivenDetail`, `WhenDetail`, `ThenExpected` 定义具体的结构体类型,以增强类型安全和表达力。
2. **定义测试用例表 (Define Test Case Table)**:将所有测试用例集中定义在一个 `[]test.Case` 类型的切片中。
3. **循环执行测试 (Loop Through Test Cases)**:使用 `for` 循环遍历测试用例表,并为每个用例运行 `t.Run(tt.Scenario, func(t *testing.T) { ... })`
4. **清晰的 G/W/T 逻辑块 (Clear G/W/T Blocks)**:在每个 `t.Run` 的匿名函数内部,根据需要组织代码块,以对应 `Given`(准备初始状态,通常基于 `tt.GivenDetail`)、`When`(执行被测操作,通常使用 `tt.WhenDetail`)和 `Then`(断言结果,通常对比 `tt.ThenExpected`)。
5. **描述性变量名 (Descriptive Variable Names)**:使用与 BDD 术语(如 `given`, `when`, `then`, `expected`, `actual`)相匹配或能清晰表达意图的变量名。
## 3. 依赖隔离Mocking (Dependency Isolation: Mocking)
单元测试的核心原则之一是**隔离性 (Isolation)**:即被测试的代码单元(如一个函数或一个方法)应该与其依赖项隔离开来。Mocking (模拟) 是实现这种隔离的关键技术。
我们主要使用 `github.com/stretchr/testify/mock` 库来实现 Mocking。特别是对于实现了 `pkg/component/component.go` 中 `Component` 接口的组件,我们提供了一种标准的 Mocking 方式。
```go
type givenDetail struct {
// Example of another initial state field for the component being tested
initialProcessingPrefix string
// MockOption to set up the behavior of dependencyA
dependencyAMockSetup component.MockOption
// ...
}
type whenDetail struct {
processDataInput string
// ...
}
type thenExpected struct {
expectedOutput string
expectedError error
// ...
}
tests := []test.Case[givenDetail, whenDetail, thenExpected]{
{
Scenario: "Component processes data successfully with mocked dependency",
Given: "YourComponent with an initial prefix and dependencyA mocked to return 'related_data_value' for 'input_key'",
When: "ProcessData is called with 'input_key'",
Then: "Should return 'prefix:input_key:related_data_value' and no error",
GivenDetail: givenDetail{
initialProcessingPrefix: "prefix1",
dependencyAMockSetup: func(m *mock.Mock) {
// We expect DependencyA's FetchRelatedData to be called with "input_key"
// and it should return "related_data_value" and no error.
m.On("FetchRelatedData", "input_key").
Return("related_data_value", nil).
Once() // Expect it to be called exactly once.
},
},
WhenDetail: whenDetail{
processDataInput: "input_key",
},
ThenExpected: thenExpected{
expectedOutput: "prefix1:input_key:related_data_value",
expectedError: nil,
},
},
// ...更多测试用例...
}
// 在 for _, tt := range tests { t.Run(tt.Scenario, func(t *testing.T) { ... }) } 循环内部
// Given 阶段: Setup mocks and the component under test
var mockHelperForDepA *mock.Mock
defer func() { // 确保在每个子测试结束时断言
if mockHelperForDepA != nil {
mockHelperForDepA.AssertExpectations(t)
}
}()
// 创建并配置 mockDependencyA
// dependency_a_pkg.NewFactory 应该是一个返回 DependencyA 接口和 error 的工厂函数
// 它接受 component.MockOption 来配置其内部的 mock.Mock 对象
mockDependencyA, err := dependency_a_pkg.NewFactory(
component.MockOption(func(m *mock.Mock) {
mockHelperForDepA = m // 保存 mock.Mock 实例以供 AssertExpectations 使用
if tt.GivenDetail.dependencyAMockSetup != nil {
// 应用测试用例中定义的 specific mock setup
tt.GivenDetail.dependencyAMockSetup(m)
}
}),
).New("mocked_dep_a_instance", nil /* config for dep A */, dependency_a_pkg.Dependencies{})
Expect(err).NotTo(HaveOccurred())
Expect(mockDependencyA).NotTo(BeNil())
// 假设 YourComponent 的构造函数如下:
componentUnderTest := NewYourComponent(tt.GivenDetail.initialProcessingPrefix, mockDependencyA)
// When 阶段: Execute the action being tested
actualOutput, actualErr := componentUnderTest.ProcessData(context.Background(), tt.WhenDetail.processDataInput)
// Then 阶段: Assert the outcomes
if tt.ThenExpected.expectedError != nil {
Expect(actualErr).To(HaveOccurred())
Expect(actualErr.Error()).To(Equal(tt.ThenExpected.expectedError.Error()))
} else {
Expect(actualErr).NotTo(HaveOccurred())
}
Expect(actualOutput).To(Equal(tt.ThenExpected.expectedOutput))
```

102
docs/tech/vector-zh.md Normal file
View File

@@ -0,0 +1,102 @@
> 适用版本v0.2.2
## 1. 引言
`vector.Index` 组件是 Zenfeed 系统中负责实现内容语义相似度检索的核心模块,与 `block.Block` 一一对应。它的主要目标是根据用户提供的查询向量,快速找到与之在语义上最相关的 Feed(通常是新闻资讯、文章等文本内容)。
该索引的核心设计理念是服务于**文档级别的召回 (Document-level Recall)**。与许多传统向量索引将每个文本块(chunk)视为独立节点不同,`vector.Index` 将**整个 Feed 文档作为图中的一个节点**。而 Feed 内容经过 `embedding_spliter` 切分后产生的多个文本块(chunks),它们各自的向量嵌入(embeddings)则作为该 Feed 节点的属性。
这种设计的独特性在于:
* **搜索结果直接是 Feed ID**:用户搜索后直接获得相关 Feed 的标识符,而不是零散的文本片段。
* **相似度聚焦于“任何部分相关即相关”**:如果一个 Feed 的任何一个 chunk 与查询向量高度相似,整个 Feed 就被认为是相关的。其最终得分为该 Feed 所有 chunks 与查询向量相似度中的最大值。
* **为新闻资讯场景优化**:这种设计特别适合新闻资讯类应用,优先保证相关内容的召回率,确保用户不会错过重要信息,即使该信息仅是文章的一部分。
`vector.Index` 底层采用 HNSW (Hierarchical Navigable Small World) 算法来组织和搜索这些 Feed 节点,以实现高效的近似最近邻查找。
## 2. 核心概念
理解 `vector.Index` 的运作方式,需要熟悉以下核心概念:
* **Feed (Node)**:
*`vector.Index` 的 HNSW 图中,每个**节点 (node)** 代表一个独立的 **Feed 文档** (例如一篇新闻报道)。
* 每个 Feed 通过一个唯一的 `uint64` ID 来标识。
* 节点存储了其对应的原始 Feed ID 以及与该 Feed 相关的多个向量。
* **Chunk (Vector Represented by `[][]float32`)**:
* 一个 Feed 的内容(尤其是其文本标签,如标题、正文)可能较长。如果直接将整个长文本生成单一的 embedding可能会遇到以下问题
* **LLM 输入长度限制**: 许多 embedding 模型对输入文本的长度有限制。
* **语义稀释 (Semantic Dilution)**: 对于包含多个主题或信息点的长文本,单一向量可能难以精确捕捉所有细微的语义,导致关键信息在整体平均化的向量表示中被“稀释”,降低了特定语义片段的表征能力。例如,一篇包含多个不同事件的综合报道,其单一向量可能无法很好地代表其中任何一个特定事件。
* 通过 `embeddingSpliter`,一个 Feed 的文本内容可以被切分成一个或多个语义相对连贯的 **文本块 (Chunks)**。这种切分有助于每个 chunk 聚焦于更具体的主题或信息点。
* 每个 Chunk 会被送入 LLM 生成一个 **向量嵌入 (vector embedding)**
* 因此,一个 Feed 节点在索引中会关联**一组向量 (vectors `[][]float32`)**,每个子向量代表其一个 Chunk 的语义。
* **Embedding**:
* Embedding 是一个由浮点数组成的向量,由大语言模型 (LLM) 生成。它能够捕捉文本片段的语义信息,使得语义上相似的文本在向量空间中距离更近。
* `vector.Index` 存储和比较的就是这些 embeddings。
* **HNSW (Hierarchical Navigable Small World)**:
* `vector.Index` 使用 HNSW 作为其底层的近似最近邻 (ANN) 搜索算法。
* HNSW 通过构建一个多层的图结构来实现高效搜索。上层图更稀疏,用于快速导航;下层图更密集,用于精确查找。
* 这种结构使得索引在插入新节点和执行搜索时都能保持较好的性能。
* **相似度计算 (Similarity Score)**:
* **Feed 间相似度 (Inter-Feed Similarity)**:
* 当评估 HNSW 图中两个 Feed 节点(例如,`nodeA``nodeB`)之间的相似度时,策略是计算 `nodeA` 的所有 Chunk 向量与 `nodeB` 的所有 Chunk 向量之间的两两余弦相似度。
* 最终,这两个 Feed 节点间的相似度取所有这些两两 Chunk 相似度中的**最大值 (Maximal Local Similarity)**。
* **选择此策略的原因**: 对于新闻资讯,只要两篇报道中存在任何一对高度相关的片段(例如,都报道了同一核心事件或引用了同一关键信息),就认为这两篇报道具有强关联性。这有助于最大化召回率,确保用户能发现所有可能相关的资讯,即使它们整体侧重点不同。
* **潜在影响**: 这种策略对局部强相关非常敏感,但也可能因为次要内容的偶然相似而将整体主题差异较大的 Feed 判定为相关,需要在上层应用或通过重排序模型来进一步优化精度。
* **查询与 Feed 相似度 (Query-Feed Similarity)**:
* 当用户使用一个查询向量 `q` 进行搜索时,计算 `q` 与目标 Feed 的每一个 Chunk 向量的余弦相似度。
* 该 Feed 最终与查询 `q` 的相似度分数,同样取这些计算结果中的**最大值**。
* 这样做是为了确保只要 Feed 的任何一部分内容与用户查询高度匹配,该 Feed 就会被召回。
## 3. 主要接口
`vector.Index` 提供了一组清晰的接口,用于管理和查询基于 Feed 内容语义的向量索引。
* **`Add(ctx context.Context, id uint64, vectors [][]float32) error`**
* **业务目标**: 将一个新的 Feed 文档及其所有内容块Chunks的向量表示添加到索引中使其能够被后续的相似度搜索发现。
* **核心流程**:
1. **接收 Feed 数据**: 接收 Feed 的唯一 `id` 和代表其所有 Chunks 的 `vectors` 列表。
2. **确定插入策略**: 根据 HNSW 算法的层级构建原则,为该 Feed 节点随机确定一个在多层图结构中的最高插入层级。
3. **查找邻近节点**: 从选定的最高层级开始逐层向下,在每一层利用该层的图结构(和 `EfConstruct` 参数指导下的搜索范围)为新 Feed 节点找到一组最相似的已有 Feed 节点(邻居)。此处的“相似”基于我们定义的“最大局部相似性”——即比较两个 Feed 所有 Chunk 向量对,取其中相似度最高的一对作为这两个 Feed 的相似度。
4. **建立连接**: 如果新 Feed 节点被分配到当前层级,则将其与找到的邻居建立双向连接(朋友关系),并更新其在该层级的友邻列表。
5. **维护图结构**: 在添加连接后,可能会触发友邻剪枝逻辑,以确保每个节点的友邻数量符合配置(`M``2*M`),并尝试维护图的良好连接性,避免产生孤立节点或过度密集的区域。
* **`Search(ctx context.Context, q []float32, threshold float32, limit int) (map[uint64]float32, error)`**
* **业务目标**: 根据用户提供的查询向量 `q`,从索引中高效地检索出语义上最相似的 Feed 列表,并返回它们的 ID 及相似度得分。
* **核心流程**:
1. **接收查询**: 接收查询向量 `q`、相似度阈值 `threshold` 和期望返回的最大结果数 `limit`
2. **导航至目标区域**: 从 HNSW 图的顶层开始,利用稀疏的高层图结构快速定位到与查询向量 `q` 大致相关的区域,逐层向下,每层都找到与 `q` 更近的节点作为下一层的入口。
3. **在底层精确搜索**: 到达最底层的图(第 0 层,包含所有 Feed 节点)后,以上一步得到的入口点为起点,进行一次更细致的扩展搜索(受 `EfSearch` 参数指导的搜索范围)。此搜索旨在找到与查询向量 `q` 的“最大局部相似性”(即 `q` 与 Feed 的所有 Chunk 向量相似度中的最大值)满足 `threshold` 且排名前 `limit` 的 Feed。
4. **返回结果**: 将符合条件的 Feed ID 及其对应的最高相似度分数打包返回。
* **`EncodeTo(ctx context.Context, w io.Writer) error` / `DecodeFrom(ctx context.Context, r io.Reader) error`**
* **业务目标**: 提供索引的持久化能力,允许将内存中的索引状态完整地保存到外部存储(如文件),并在需要时恢复。
* **核心流程 (`EncodeTo`)**:
1. **写入元数据**: 保存索引的配置参数(如 `M`, `Ml`, `EfConstruct`, `EfSearch`)和版本信息。
2. **写入节点数据**: 遍历所有 Feed 节点,依次保存每个节点的 ID、其所有 Chunk 向量(经过量化处理以压缩体积)、以及它在 HNSW 各层级上的友邻关系(友邻 ID 和相似度)。
3. **写入层级结构**: 保存每个层级所包含的节点 ID 列表。
* **核心流程 (`DecodeFrom`)**:
1. **读取元数据**: 恢复索引配置。
2. **重建节点数据**: 读取并重建所有 Feed 节点,包括其 ID、反量化后的 Chunk 向量、以及友邻关系。
3. **重建层级结构**: 恢复 HNSW 的多层图。
## 4. 内部实现细节补充
### 4.1 核心数据表示
* **Feed 节点 (`node`)**: 每个 Feed 在内存中表示为一个 `node` 对象,它不仅存储了 Feed 的 ID 和其所有 Chunk 的向量 (`vectors [][]float32`),还关键地维护了它在 HNSW 图各个层级上的“友邻列表” (`friendsOnLayers`)。这个友邻列表是图连接性的基础。
* **分层图 (`layers`)**: 索引内部维护一个 `layers` 列表,代表 HNSW 的多层结构。高层图节点更少、连接更稀疏,用于快速跳转;底层图(尤其是第 0 层)节点最多、连接最密集,用于精确搜索。
* **全局节点池 (`m`)**: 一个从 Feed ID 到 `node` 对象的映射,方便快速访问任何已索引的 Feed。
### 4.2 索引构建的关键机制
* **概率性分层 (`randomInsertLevel`)**: 新加入的 Feed 节点会被随机分配到一个最高层级。这种概率机制(受 `Ml` 参数影响)形成了 HNSW 的金字塔式层级结构。
* **动态邻居选择 (`insertAndLinkAtLevel` 中的搜索逻辑)**: 当一个新 Feed 节点加入某一层时,它会基于“最大局部相似性”在该层搜索一定数量(受 `EfConstruct` 影响)的最近邻居。
* **连接维护与剪枝 (`makeFriend`, `tryRemoveFriend`)**: 与邻居建立双向连接后,为保证图的性能和结构(避免节点拥有过多邻居),会有一套剪枝逻辑。这套逻辑不仅考虑移除相似度最低的连接,有时还会考虑被移除连接的另一端节点的连接状况,试图避免制造“孤岛”节点,甚至在必要时(通过 `tryRemakeFriend`)为连接数过少的节点尝试从“邻居的邻居”中寻找新的连接机会。
### 4.3 存储效率:向量量化
* 为了显著减少索引在持久化存储时占用的空间,`float32` 类型的向量在写入磁盘前会通过 `vectorutil.Quantize` 被转换为 `int8` 类型,并记录下转换所需的最小值和缩放比例。读取时再通过 `vectorutil.Dequantize` 进行有损恢复。这是在存储成本和表示精度之间的一种实用权衡。

View File

@@ -0,0 +1,28 @@
## 0. Check your current version
```bash
# Mac/Linux
docker inspect glidea/zenfeed:latest | grep version
# Windows PowerShell
docker inspect glidea/zenfeed:latest | Select-String -Pattern 'version'
```
If you **don't see any results**, it means you're using version v0.1.0. This is because the first version didn't include version information. Therefore, **this document applies to you.**
## 1. Move your data to the correct volume path
```bash
docker-compose -p zenfeed exec zenfeed cp -a /data/. /app/data/
```
## 2. Backup your config
Access: http://localhost:1400
![](images/upgrade-from-v0.1.0-backup.png)
## 3. Upgrade
See [upgrade](./upgrade.md)
## 4. Resave your config
Access: http://localhost:1400
Resave your config.
These tedious steps are due to the oversight in the deployment form of the first version, and I apologize for that. Subsequent versions will not require these extra steps.

19
docs/upgrade.md Normal file
View File

@@ -0,0 +1,19 @@
**NOTE:** If you are upgrading from v0.1.0, which is the first version, please refer to [upgrade-from-v0.1.0.md](./upgrade-from-v0.1.0.md)
```bash
# Ensure compose yml up to date.
## Mac/Linux
curl -L -O https://raw.githubusercontent.com/glidea/zenfeed/main/docker-compose.yml
## Windows PowerShell
Invoke-WebRequest -Uri "https://raw.githubusercontent.com/glidea/zenfeed/main/docker-compose.yml" -OutFile ([System.IO.Path]::GetFileName("https://raw.githubusercontent.com/glidea/zenfeed/main/docker-compose.yml"))
# Ensure images up to date.
docker-compose -p zenfeed pull
# Upgrade without reconfiguring anything (e.g. the API Key is preserved).
docker-compose -p zenfeed up -d
```
Then all the feed data and configurations should be intact.

148
docs/webhook-zh.md Normal file
View File

@@ -0,0 +1,148 @@
# Zenfeed Webhook 通知对接指南
Zenfeed 支持通过 Webhook 将分组和总结后的 Feed 通知推送到您指定的 HTTP(S) 端点。这允许您将 Zenfeed 的通知集成到自定义的应用或工作流程中。
## 1. 配置方法
要在 Zenfeed 中配置 Webhook 通知,您需要在配置文件的 `notify.receivers` 部分定义一个或多个接收者,并为每个 Webhook 接收者指定其唯一的 `name``webhook` 配置块。
**示例配置 (`config.yaml`):**
```yaml
notify:
# ... 其他通知配置 ...
receivers:
- name: my_awesome_webhook # 接收者的唯一名称,将在路由规则中引用
webhook:
url: "https://your-service.com/webhook-endpoint" # 您的 Webhook 接收端点 URL
# 示例:路由规则中如何使用此接收者
route: # or sub_routes..
receivers:
- my_awesome_webhook # 引用上面定义的接收者名称
# ... 其他路由配置 ...
```
在上述示例中:
- 我们定义了一个名为 `my_awesome_webhook` 的接收者。
- `webhook.url` 字段指定了当有匹配此接收者的通知时Zenfeed 将向哪个 URL 发送 POST 请求。
## 2. 数据格式详解
当 Zenfeed 向您的 Webhook 端点发送通知时,它会发送一个 `POST` 请求,请求体为 JSON 格式。
请求体结构如下:
```json
{
"group": "string",
"labels": {
"label_key1": "label_value1",
"label_key2": "label_value2"
},
"summary": "string",
"feeds": [
{
"labels": {
"title": "Feed Title 1",
"link": "http://example.com/feed1",
"content": "Feed content snippet 1...",
"source": "example_source",
"pub_time": "2024-07-30T10:00:00Z"
// ... 其他自定义或标准标签
},
"time": "2024-07-30T10:00:00Z",
"related": [
// 可选:与此 Feed 相关的其他 Feed 对象,结构同父 Feed
]
}
// ...更多 Feed 对象
]
}
```
**字段说明:**
- `group` (`string`):
当前通知所属的组名。这个名称是根据通知路由配置中 `group_by` 定义的标签值组合而成的。例如,如果 `group_by: ["source", "category"]`,且一个 Feed 组的 `source` 为 `github_trending`、`category` 为 `golang`,那么 `group` 可能类似于 `"github_trending/golang"`。
- `labels` (`object`):
一个键值对对象,表示当前通知组的标签。这些标签是根据通知路由配置中 `group_by` 所指定的标签及其对应的值。
例如,如果 `group_by: ["source"]` 且当前组的 `source` 标签值为 `rsshub`,则 `labels` 会是 `{"source": "rsshub"}`
- `summary` (`string`):
由大语言模型 (LLM) 为当前这一组 Feed 生成的摘要文本。如果通知路由中没有配置 LLM 总结,此字段可能为空字符串或省略 (取决于具体的实现细节,但通常会是空字符串)。
- `feeds` (`array` of `object`):
一个数组,包含了属于当前通知组的所有 Feed 对象。每个 Feed 对象包含以下字段:
* `labels` (`object`): Feed 的元数据。这是一个键值对对象,包含了该 Feed 的所有标签,例如:
* `title` (`string`): Feed 的标题。
* `link` (`string`): Feed 的原始链接。
* `content` (`string`): Feed 的内容摘要或全文 (取决于抓取和重写规则)。
* `source` (`string`): Feed 的来源标识。
* `pub_time` (`string`): Feed 的发布时间 (RFC3339 格式的字符串,例如 `2025-01-01T00:00:00Z`)。
* ...以及其他在抓取或重写过程中添加的自定义标签。
* `time` (`string`): Feed 的时间戳,通常是其发布时间,采用 RFC3339 格式 (例如 `2025-01-01T00:00:00Z`)。此字段与 `labels.pub_time` 通常一致,但 `time` 是系统内部用于时间序列处理的主要时间字段。
* `related` (`array` of `object`, 可选):
一个数组,包含了与当前 Feed 语义相关的其他 Feed 对象。这通常在通知路由中启用了 `compress_by_related_threshold` 选项时填充。每个相关的 Feed 对象结构与父 Feed 对象完全相同。如果未启用相关性压缩或没有相关的 Feed此字段可能为空数组或不存在。
## 3. 请求示例
以下是一个发送到您的 Webhook 端点的 JSON 请求体示例:
```json
{
"group": "my_favorite_blogs",
"labels": {
"category": "tech_updates"
},
"summary": "今天有多篇关于最新 AI 技术进展的文章,重点关注了大型语言模型在代码生成方面的应用,以及其对未来软件开发模式的潜在影响。",
"feeds": [
{
"labels": {
"content": "AlphaCode X 展示了惊人的代码理解和生成能力,在多个编程竞赛中超越了人类平均水平...",
"link": "https://example.blog/alphacode-x-details",
"pub_time": "2024-07-30T14:35:10Z",
"source": "Example Tech Blog",
"title": "AlphaCode X: 下一代 AI 编码助手",
"type": "blog_post"
},
"time": "2024-07-30T14:35:10Z",
"related": []
},
{
"labels": {
"content": "讨论了当前 LLM 在实际软件工程项目中落地所面临的挑战,包括成本、可控性和安全性问题。",
"link": "https://another.blog/llm-in-swe-challenges",
"pub_time": "2024-07-30T11:15:00Z",
"source": "Another Tech Review",
"title": "LLM 在软件工程中的应用:机遇与挑战",
"type": "rss"
},
"time": "2024-07-30T11:15:00Z",
"related": [
{
"labels": {
"content": "一篇关于如何更经济有效地部署和微调大型语言模型的指南。",
"link": "https://some.other.blog/cost-effective-llm",
"pub_time": "2024-07-30T09:00:00Z",
"source": "AI Infra Weekly",
"title": "经济高效的 LLM 部署策略",
"type": "rss"
},
"time": "2024-07-30T09:00:00Z",
"related": []
}
]
}
]
}
```
## 4. 响应要求
Zenfeed 期望您的 Webhook 端点在成功接收并处理通知后,返回 HTTP `200 OK` 状态码。
如果 Zenfeed 收到任何非 `200` 的状态码,它会将该次通知尝试标记为失败,并可能根据重试策略进行重试 (具体重试行为取决于 Zenfeed 的内部实现)。
请确保您的端点能够及时响应,以避免超时。

15
go.mod
View File

@@ -7,14 +7,17 @@ require (
github.com/benbjohnson/clock v1.3.5
github.com/chewxy/math32 v1.10.1
github.com/edsrzf/mmap-go v1.2.0
github.com/gorilla/feeds v1.2.0
github.com/mark3labs/mcp-go v0.17.0
github.com/minio/minio-go/v7 v7.0.94
github.com/mmcdole/gofeed v1.3.0
github.com/nutsdb/nutsdb v1.0.4
github.com/onsi/gomega v1.36.1
github.com/pkg/errors v0.9.1
github.com/prometheus/client_golang v1.21.1
github.com/sashabaranov/go-openai v1.36.1
github.com/sashabaranov/go-openai v1.40.1
github.com/stretchr/testify v1.10.0
github.com/temoto/robotstxt v1.1.2
github.com/veqryn/slog-dedup v0.5.0
github.com/yuin/goldmark v1.7.8
gopkg.in/gomail.v2 v2.0.0-20160411212932-81ebce5c23df
@@ -31,24 +34,34 @@ require (
github.com/bwmarrin/snowflake v0.3.0 // indirect
github.com/cespare/xxhash/v2 v2.3.0 // indirect
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/dustin/go-humanize v1.0.1 // indirect
github.com/go-ini/ini v1.67.0 // indirect
github.com/goccy/go-json v0.10.5 // indirect
github.com/gofrs/flock v0.8.1 // indirect
github.com/google/go-cmp v0.7.0 // indirect
github.com/google/uuid v1.6.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/klauspost/compress v1.18.0 // indirect
github.com/klauspost/cpuid/v2 v2.2.10 // indirect
github.com/minio/crc64nvme v1.0.1 // indirect
github.com/minio/md5-simd v1.1.2 // indirect
github.com/mmcdole/goxpp v1.1.1-0.20240225020742-a0c311522b23 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/philhofer/fwd v1.1.3-0.20240916144458-20a13a1f6b7c // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/prometheus/client_model v0.6.1 // indirect
github.com/prometheus/common v0.62.0 // indirect
github.com/prometheus/procfs v0.15.1 // indirect
github.com/rs/xid v1.6.0 // indirect
github.com/stretchr/objx v0.5.2 // indirect
github.com/tidwall/btree v1.6.0 // indirect
github.com/tinylib/msgp v1.3.0 // indirect
github.com/xujiajun/mmap-go v1.0.1 // indirect
github.com/xujiajun/utils v0.0.0-20220904132955-5f7c5b914235 // indirect
github.com/yosida95/uritemplate/v3 v3.0.2 // indirect
golang.org/x/crypto v0.36.0 // indirect
golang.org/x/net v0.38.0 // indirect
golang.org/x/sys v0.31.0 // indirect
golang.org/x/text v0.23.0 // indirect

34
go.sum
View File

@@ -21,12 +21,18 @@ github.com/chewxy/math32 v1.10.1/go.mod h1:dOB2rcuFrCn6UHrze36WSLVPKtzPMRAQvBvUw
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
github.com/edsrzf/mmap-go v1.2.0 h1:hXLYlkbaPzt1SaQk+anYwKSRNhufIDCchSPkUD6dD84=
github.com/edsrzf/mmap-go v1.2.0/go.mod h1:19H/e8pUPLicwkyNgOykDXkJ9F0MHE+Z52B8EIth78Q=
github.com/go-ini/ini v1.67.0 h1:z6ZrTEZqSWOTyH2FlglNbNgARyHG8oLW9gMELqKr06A=
github.com/go-ini/ini v1.67.0/go.mod h1:ByCAeIL28uOIIG0E3PJtZPDL8WnHpFKFOtgjp+3Ies8=
github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY=
github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY=
github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI=
github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8=
github.com/goccy/go-json v0.10.5 h1:Fq85nIqj+gXn/S5ahsiTlK3TmC85qgirsdTP/+DeaC4=
github.com/goccy/go-json v0.10.5/go.mod h1:oq7eo15ShAhp70Anwd5lgX2pLfOS3QCiwU/PULtXL6M=
github.com/gofrs/flock v0.8.1 h1:+gYjHKf32LDeiEEFhQaotPbLuUXjY5ZqxKgXy7n59aw=
github.com/gofrs/flock v0.8.1/go.mod h1:F1TvTiK9OcQqauNUHlbJvyl9Qa1QvF/gOUDKA14jxHU=
github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
@@ -36,20 +42,32 @@ github.com/google/pprof v0.0.0-20240827171923-fa2c70bbbfe5 h1:5iH8iuqE5apketRbSF
github.com/google/pprof v0.0.0-20240827171923-fa2c70bbbfe5/go.mod h1:vavhavw2zAxS5dIdcRluK6cSGGPlZynqzFM8NdvU144=
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/gorilla/feeds v1.2.0 h1:O6pBiXJ5JHhPvqy53NsjKOThq+dNFm8+DFrxBEdzSCc=
github.com/gorilla/feeds v1.2.0/go.mod h1:WMib8uJP3BbY+X8Szd1rA5Pzhdfh+HCCAYT2z7Fza6Y=
github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo=
github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ=
github.com/klauspost/cpuid/v2 v2.0.1/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg=
github.com/klauspost/cpuid/v2 v2.2.10 h1:tBs3QSyvjDyFTq3uoc/9xFpCuOsJQFNPiAhYdw2skhE=
github.com/klauspost/cpuid/v2 v2.2.10/go.mod h1:hqwkgyIinND0mEev00jJYCxPNVRVXFQeu1XKlok6oO0=
github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE=
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc=
github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw=
github.com/mark3labs/mcp-go v0.17.0 h1:5Ps6T7qXr7De/2QTqs9h6BKeZ/qdeUeGrgM5lPzi930=
github.com/mark3labs/mcp-go v0.17.0/go.mod h1:KmJndYv7GIgcPVwEKJjNcbhVQ+hJGJhrCCB/9xITzpE=
github.com/minio/crc64nvme v1.0.1 h1:DHQPrYPdqK7jQG/Ls5CTBZWeex/2FMS3G5XGkycuFrY=
github.com/minio/crc64nvme v1.0.1/go.mod h1:eVfm2fAzLlxMdUGc0EEBGSMmPwmXD5XiNRpnu9J3bvg=
github.com/minio/md5-simd v1.1.2 h1:Gdi1DZK69+ZVMoNHRXJyNcxrMA4dSxoYHZSQbirFg34=
github.com/minio/md5-simd v1.1.2/go.mod h1:MzdKDxYpY2BT9XQFocsiZf/NKVtR7nkE4RoEpN+20RM=
github.com/minio/minio-go/v7 v7.0.94 h1:1ZoksIKPyaSt64AVOyaQvhDOgVC3MfZsWM6mZXRUGtM=
github.com/minio/minio-go/v7 v7.0.94/go.mod h1:71t2CqDt3ThzESgZUlU1rBN54mksGGlkLcFgguDnnAc=
github.com/mmcdole/gofeed v1.3.0 h1:5yn+HeqlcvjMeAI4gu6T+crm7d0anY85+M+v6fIFNG4=
github.com/mmcdole/gofeed v1.3.0/go.mod h1:9TGv2LcJhdXePDzxiuMnukhV2/zb6VtnZt1mS+SjkLE=
github.com/mmcdole/goxpp v1.1.1-0.20240225020742-a0c311522b23 h1:Zr92CAlFhy2gL+V1F+EyIuzbQNbSgP4xhTODZtrXUtk=
@@ -67,6 +85,8 @@ github.com/onsi/ginkgo/v2 v2.20.1 h1:YlVIbqct+ZmnEph770q9Q7NVAz4wwIiVNahee6JyUzo
github.com/onsi/ginkgo/v2 v2.20.1/go.mod h1:lG9ey2Z29hR41WMVthyJBGUBcBhGOtoPF2VFMvBXFCI=
github.com/onsi/gomega v1.36.1 h1:bJDPBO7ibjxcbHMgSCoo4Yj18UWbKDlLwX1x9sybDcw=
github.com/onsi/gomega v1.36.1/go.mod h1:PvZbdDc8J6XJEpDK4HCuRBm8a6Fzp9/DmhC9C7yFlog=
github.com/philhofer/fwd v1.1.3-0.20240916144458-20a13a1f6b7c h1:dAMKvw0MlJT1GshSTtih8C2gDs04w8dReiOGXrGLNoY=
github.com/philhofer/fwd v1.1.3-0.20240916144458-20a13a1f6b7c/go.mod h1:RqIHx9QI14HlwKwm98g9Re5prTQ6LdeRQn+gXJFxsJM=
github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
@@ -84,8 +104,10 @@ github.com/remyoudompheng/bigfft v0.0.0-20200410134404-eec4a21b6bb0 h1:OdAsTTz6O
github.com/remyoudompheng/bigfft v0.0.0-20200410134404-eec4a21b6bb0/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo=
github.com/rogpeppe/go-internal v1.10.0 h1:TMyTOH3F/DB16zRVcYyreMH6GnZZrwQVAoYjRBZyWFQ=
github.com/rogpeppe/go-internal v1.10.0/go.mod h1:UQnix2H7Ngw/k4C5ijL5+65zddjncjaFoBhdsK/akog=
github.com/sashabaranov/go-openai v1.36.1 h1:EVfRXwIlW2rUzpx6vR+aeIKCK/xylSrVYAx1TMTSX3g=
github.com/sashabaranov/go-openai v1.36.1/go.mod h1:lj5b/K+zjTSFxVLijLSTDZuP7adOgerWeFyZLUhAKRg=
github.com/rs/xid v1.6.0 h1:fV591PaemRlL6JfRxGDEPl69wICngIQ3shQtzfy2gxU=
github.com/rs/xid v1.6.0/go.mod h1:7XoLgs4eV+QndskICGsho+ADou8ySMSjJKDIan90Nz0=
github.com/sashabaranov/go-openai v1.40.1 h1:bJ08Iwct5mHBVkuvG6FEcb9MDTfsXdTYPGjYLRdeTEU=
github.com/sashabaranov/go-openai v1.40.1/go.mod h1:lj5b/K+zjTSFxVLijLSTDZuP7adOgerWeFyZLUhAKRg=
github.com/sebdah/goldie/v2 v2.5.3 h1:9ES/mNN+HNUbNWpVAlrzuZ7jE+Nrczbj8uFRjM7624Y=
github.com/sebdah/goldie/v2 v2.5.3/go.mod h1:oZ9fp0+se1eapSRjfYbsV/0Hqhbuu3bJVvKI/NNtssI=
github.com/sergi/go-diff v1.0.0/go.mod h1:0CfEIISq7TuYL3j771MWULgwwjU+GofnZX9QAmXWZgo=
@@ -99,8 +121,12 @@ github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81P
github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA=
github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
github.com/temoto/robotstxt v1.1.2 h1:W2pOjSJ6SWvldyEuiFXNxz3xZ8aiWX5LbfDiOFd7Fxg=
github.com/temoto/robotstxt v1.1.2/go.mod h1:+1AmkuG3IYkh1kv0d2qEB9Le88ehNO0zwOr3ujewlOo=
github.com/tidwall/btree v1.6.0 h1:LDZfKfQIBHGHWSwckhXI0RPSXzlo+KYdjK7FWSqOzzg=
github.com/tidwall/btree v1.6.0/go.mod h1:twD9XRA5jj9VUQGELzDO4HPQTNJsoWWfYEL+EUQ2cKY=
github.com/tinylib/msgp v1.3.0 h1:ULuf7GPooDaIlbyvgAxBV/FI7ynli6LZ1/nVUNu+0ww=
github.com/tinylib/msgp v1.3.0/go.mod h1:ykjzy2wzgrlvpDCRc4LA8UXy6D8bzMSuAF3WD57Gok0=
github.com/veqryn/slog-dedup v0.5.0 h1:2pc4va3q8p7Tor1SjVvi1ZbVK/oKNPgsqG15XFEt0iM=
github.com/veqryn/slog-dedup v0.5.0/go.mod h1:/iQU008M3qFa5RovtfiHiODxJFvxZLjWRG/qf/zKFHw=
github.com/xujiajun/mmap-go v1.0.1 h1:7Se7ss1fLPPRW+ePgqGpCkfGIZzJV6JPq9Wq9iv/WHc=
@@ -118,6 +144,8 @@ golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5y
golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU=
golang.org/x/crypto v0.22.0/go.mod h1:vr6Su+7cTlO45qkww3VDJlzDn0ctJvRgYbC2NvXHt+M=
golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8=
golang.org/x/crypto v0.36.0 h1:AnAEvhDddvBdpY+uR+MyHmuZzzNqXSe/GvuDeob5L34=
golang.org/x/crypto v0.36.0/go.mod h1:Y4J0ReaxCR1IMaabaSMugxJES1EpwhBHhv2bDHklZvc=
golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=

View File

@@ -1,25 +0,0 @@
timezone: Asia/Shanghai
llms:
- name: general
default: true
provider: siliconflow
model: Qwen/Qwen2.5-7B-Instruct
- name: embed
provider: siliconflow
embedding_model: Pro/BAAI/bge-m3
scrape:
rsshub_endpoint: http://rsshub:1200
storage:
feed:
rewrites:
- transform:
to_text:
prompt: |
{{ .summary_html_snippet }}
label: summary_html_snippet
embedding_llm: embed
notify:
channels:
email:
feed_html_snippet_template: |
{{ .summary_html_snippet }}

View File

@@ -1,24 +0,0 @@
version: "3.8"
services:
zenfeed:
image: glidea/zenfeed:latest
volumes:
- data:/app/data
- type: bind
source: ./config
target: /app/config
ports:
- "1300:1300"
- "1301:1301"
depends_on:
- rsshub
rsshub:
image: diygod/rsshub:latest
ports:
- "1200:1200"
environment:
- NODE_ENV=production
volumes:
data: {}

View File

@@ -1,131 +0,0 @@
#!/bin/bash
# Render a YAML document from an optional template (local file or HTTP/HTTPS
# URL) plus "key=value" overrides, applied one by one with yq running in
# Docker. The final YAML is printed to stdout.
#
# Usage:
#   script.sh [--template FILE_OR_URL] [--values path.to.key=value ...]

YQ_IMAGE="mikefarah/yq:latest"

template_source=""
values_args=()

# --- Parse command line arguments ---
while [[ $# -gt 0 ]]; do
    key="$1"
    case $key in
        --template)
            template_source="$2"
            shift # past argument
            shift # past value
            ;;
        --values)
            # Collect all arguments after --values until next -- argument or end.
            # NOTE: a value that itself begins with "--" cannot be passed here.
            shift # past --values
            while [[ $# -gt 0 ]] && [[ ! "$1" =~ ^-- ]]; do
                values_args+=("$1")
                shift # past value argument
            done
            ;;
        *) # Unknown option
            echo "Error: Unknown option $1" >&2
            exit 1
            ;;
    esac
done

# --- Get template content ---
current_yaml=""
if [[ -z "$template_source" ]]; then
    # If no template provided, start with empty YAML.
    current_yaml="{}"
elif [[ "$template_source" =~ ^https?:// ]]; then
    # Download from URL; curl is required.
    if ! command -v curl &> /dev/null; then
        echo "Error: curl command required to download URL template." >&2
        exit 1
    fi
    # Capture output and check the command status directly, instead of the
    # fragile assignment-then-$? pattern.
    if ! template_content=$(curl -sfL "$template_source"); then
        echo "Error: Failed to download template from URL: $template_source" >&2
        exit 1
    fi
    # An empty download degrades to an empty YAML document.
    if [[ -z "$template_content" ]]; then
        current_yaml="{}"
    else
        current_yaml="$template_content"
    fi
elif [[ -f "$template_source" ]]; then
    # Read from local file; an empty file degrades to an empty YAML document.
    current_yaml=$(cat "$template_source")
    if [[ -z "$current_yaml" ]]; then
        current_yaml="{}"
    fi
else
    # Invalid template source.
    echo "Error: Invalid template source '$template_source'. Please provide valid file path or HTTP/HTTPS URL." >&2
    exit 1
fi

# --- Check if Docker is available ---
if ! command -v docker &> /dev/null; then
    echo "Error: docker command required to run yq." >&2
    exit 1
fi
# Try pulling the yq image up front so problems surface early. A pull failure
# is only a warning: a locally cached image may still let "docker run" succeed
# (e.g. when offline).
if ! docker pull "$YQ_IMAGE" > /dev/null; then
    echo "Warning: failed to pull $YQ_IMAGE; will try any locally cached image." >&2
fi

# --- Apply values ---
if [[ ${#values_args[@]} -gt 0 ]]; then
    for val_arg in "${values_args[@]}"; do
        # Parse key=value; malformed entries are silently skipped
        # (preserves the original behavior).
        if [[ ! "$val_arg" =~ ^([^=]+)=(.*)$ ]]; then
            continue
        fi
        # BASH_REMATCH is the result array from the =~ operator.
        yaml_path="${BASH_REMATCH[1]}"
        raw_value="${BASH_REMATCH[2]}"

        # Encode the value for yq: booleans/null and numbers pass through
        # bare; everything else becomes a quoted string.
        yq_value=""
        if [[ "$raw_value" == "true" || "$raw_value" == "false" || "$raw_value" == "null" ]]; then
            yq_value="$raw_value"
        elif [[ "$raw_value" =~ ^-?[0-9]+(\.[0-9]+)?$ ]]; then
            # Integers with a leading zero (e.g. "007") are forced to strings
            # to prevent octal interpretation / normalization by yq.
            if [[ "$raw_value" =~ ^0[0-9]+$ ]]; then
                # Escape internal double quotes via parameter expansion
                # (no echo|sed subshell needed).
                escaped_value=${raw_value//\"/\\\"}
                yq_value="\"$escaped_value\""
            else
                yq_value="$raw_value"
            fi
        else
            # Treat as string; escape internal double quotes.
            escaped_value=${raw_value//\"/\\\"}
            yq_value="\"$escaped_value\""
        fi

        # Build yq expression.
        yq_expression=".$yaml_path = $yq_value"

        # Apply update via docker run yq: current YAML in via stdin
        # (here-string, no temp files), updated YAML out via stdout.
        new_yaml=$(docker run --rm -i "$YQ_IMAGE" "$yq_expression" <<< "$current_yaml")
        yq_exit_code=$?
        if [[ $yq_exit_code -ne 0 ]]; then
            echo "Error: yq execution failed (exit code: $yq_exit_code). Expression: '$yq_expression'" >&2
            exit 1
        fi
        current_yaml="$new_yaml"
    done
fi

# --- Output final result ---
printf "%s\n" "$current_yaml"
exit 0

86
main.go
View File

@@ -28,6 +28,7 @@ import (
"github.com/glidea/zenfeed/pkg/api"
"github.com/glidea/zenfeed/pkg/api/http"
"github.com/glidea/zenfeed/pkg/api/mcp"
"github.com/glidea/zenfeed/pkg/api/rss"
"github.com/glidea/zenfeed/pkg/component"
"github.com/glidea/zenfeed/pkg/config"
"github.com/glidea/zenfeed/pkg/llm"
@@ -46,7 +47,9 @@ import (
"github.com/glidea/zenfeed/pkg/storage/feed/block/index/primary"
"github.com/glidea/zenfeed/pkg/storage/feed/block/index/vector"
"github.com/glidea/zenfeed/pkg/storage/kv"
"github.com/glidea/zenfeed/pkg/storage/object"
"github.com/glidea/zenfeed/pkg/telemetry/log"
telemetryserver "github.com/glidea/zenfeed/pkg/telemetry/server"
timeutil "github.com/glidea/zenfeed/pkg/util/time"
)
@@ -118,18 +121,21 @@ type App struct {
configPath string
configMgr config.Manager
conf *config.App
telemetry telemetryserver.Server
kvStorage kv.Storage
llmFactory llm.Factory
rewriter rewrite.Rewriter
feedStorage feed.Storage
api api.API
http http.Server
mcp mcp.Server
scraperMgr scrape.Manager
scheduler schedule.Scheduler
notifier notify.Notifier
notifyChan chan *rule.Result
kvStorage kv.Storage
llmFactory llm.Factory
rewriter rewrite.Rewriter
feedStorage feed.Storage
objectStorage object.Storage
api api.API
http http.Server
mcp mcp.Server
rss rss.Server
scraperMgr scrape.Manager
scheduler schedule.Scheduler
notifier notify.Notifier
notifyChan chan *rule.Result
}
// newApp creates a new application instance.
@@ -153,9 +159,16 @@ func (a *App) setup() error {
return a.applyGlobals(newConf)
}))
if err := a.setupTelemetryServer(); err != nil {
return errors.Wrap(err, "setup telemetry server")
}
if err := a.setupKVStorage(); err != nil {
return errors.Wrap(err, "setup kv storage")
}
if err := a.setupObjectStorage(); err != nil {
return errors.Wrap(err, "setup object storage")
}
if err := a.setupLLMFactory(); err != nil {
return errors.Wrap(err, "setup llm factory")
}
@@ -174,6 +187,9 @@ func (a *App) setup() error {
if err := a.setupMCPServer(); err != nil {
return errors.Wrap(err, "setup mcp server")
}
if err := a.setupRSSServer(); err != nil {
return errors.Wrap(err, "setup rss server")
}
if err := a.setupScraper(); err != nil {
return errors.Wrap(err, "setup scraper")
}
@@ -209,8 +225,8 @@ func (a *App) applyGlobals(conf *config.App) error {
if err := timeutil.SetLocation(conf.Timezone); err != nil {
return errors.Wrapf(err, "set timezone to %s", conf.Timezone)
}
if err := log.SetLevel(log.Level(conf.Log.Level)); err != nil {
return errors.Wrapf(err, "set log level to %s", conf.Log.Level)
if err := log.SetLevel(log.Level(conf.Telemetry.Log.Level)); err != nil {
return errors.Wrapf(err, "set log level to %s", conf.Telemetry.Log.Level)
}
return nil
@@ -240,7 +256,8 @@ func (a *App) setupLLMFactory() (err error) {
// setupRewriter initializes the Rewriter factory.
func (a *App) setupRewriter() (err error) {
a.rewriter, err = rewrite.NewFactory().New(component.Global, a.conf, rewrite.Dependencies{
LLMFactory: a.llmFactory,
LLMFactory: a.llmFactory,
ObjectStorage: a.objectStorage,
})
if err != nil {
return err
@@ -271,6 +288,28 @@ func (a *App) setupFeedStorage() (err error) {
return nil
}
// setupObjectStorage initializes the Object storage.
func (a *App) setupObjectStorage() (err error) {
a.objectStorage, err = object.NewFactory().New(component.Global, a.conf, object.Dependencies{})
if err != nil {
return err
}
a.configMgr.Subscribe(a.objectStorage)
return nil
}
// setupTelemetryServer initializes the Telemetry server.
func (a *App) setupTelemetryServer() (err error) {
a.telemetry, err = telemetryserver.NewFactory().New(component.Global, a.conf, telemetryserver.Dependencies{})
if err != nil {
return err
}
return nil
}
// setupAPI initializes the API service.
func (a *App) setupAPI() (err error) {
a.api, err = api.NewFactory().New(component.Global, a.conf, api.Dependencies{
@@ -315,6 +354,20 @@ func (a *App) setupMCPServer() (err error) {
return nil
}
// setupRSSServer initializes the RSS server.
func (a *App) setupRSSServer() (err error) {
a.rss, err = rss.NewFactory().New(component.Global, a.conf, rss.Dependencies{
API: a.api,
})
if err != nil {
return err
}
a.configMgr.Subscribe(a.rss)
return nil
}
// setupScraper initializes the Scraper manager.
func (a *App) setupScraper() (err error) {
a.scraperMgr, err = scrape.NewFactory().New(component.Global, a.conf, scrape.Dependencies{
@@ -355,6 +408,7 @@ func (a *App) setupNotifier() (err error) {
RouterFactory: route.NewFactory(),
ChannelFactory: channel.NewFactory(),
KVStorage: a.kvStorage,
LLMFactory: a.llmFactory,
})
if err != nil {
return err
@@ -383,12 +437,12 @@ func (a *App) run(ctx context.Context) error {
log.Info(ctx, "starting application components...")
if err := component.Run(ctx,
component.Group{a.configMgr},
component.Group{a.llmFactory},
component.Group{a.llmFactory, a.objectStorage, a.telemetry},
component.Group{a.rewriter},
component.Group{a.feedStorage},
component.Group{a.kvStorage},
component.Group{a.notifier, a.api},
component.Group{a.http, a.mcp, a.scraperMgr, a.scheduler},
component.Group{a.http, a.mcp, a.rss, a.scraperMgr, a.scheduler},
); err != nil && !errors.Is(err, context.Canceled) {
return err
}

View File

@@ -37,7 +37,6 @@ import (
telemetry "github.com/glidea/zenfeed/pkg/telemetry"
telemetrymodel "github.com/glidea/zenfeed/pkg/telemetry/model"
jsonschema "github.com/glidea/zenfeed/pkg/util/json_schema"
"github.com/glidea/zenfeed/pkg/util/rpc"
)
// --- Interface code block ---
@@ -138,7 +137,7 @@ type QueryRSSHubRoutesResponse struct {
type RSSHubRoute struct {
Name string `json:"name,omitempty"`
Description string `json:"description,omitempty"`
Path string `json:"path,omitempty"`
Path any `json:"path,omitempty"`
Example string `json:"example,omitempty"`
Parameters map[string]any `json:"parameters,omitempty"`
Features map[string]any `json:"features,omitempty"`
@@ -161,11 +160,11 @@ type QueryRequest struct {
}
func (r *QueryRequest) Validate() error { //nolint:cyclop
if r.Query != "" && utf8.RuneCountInString(r.Query) < 5 {
return errors.New("query must be at least 5 characters")
if r.Query != "" && utf8.RuneCountInString(r.Query) > 64 {
return errors.New("query must be at most 64 characters")
}
if r.Threshold == 0 {
r.Threshold = 0.55
r.Threshold = 0.5
}
if r.Threshold < 0 || r.Threshold > 1 {
return errors.New("threshold must be between 0 and 1")
@@ -200,6 +199,28 @@ type QueryResponse struct {
Count int `json:"count"`
}
type Error struct {
Code int `json:"code"`
Message string `json:"message"`
}
func (e Error) Error() string {
return e.Message
}
func newError(code int, err error) Error {
return Error{
Code: code,
Message: err.Error(),
}
}
var (
ErrBadRequest = func(err error) Error { return newError(http.StatusBadRequest, err) }
ErrNotFound = func(err error) Error { return newError(http.StatusNotFound, err) }
ErrInternal = func(err error) Error { return newError(http.StatusInternalServerError, err) }
)
// --- Factory code block ---
type Factory component.Factory[API, config.App, Dependencies]
@@ -262,7 +283,7 @@ func (a *api) QueryAppConfigSchema(
) (resp *QueryAppConfigSchemaResponse, err error) {
schema, err := jsonschema.ForType(reflect.TypeOf(config.App{}))
if err != nil {
return nil, rpc.ErrInternal(errors.Wrap(err, "query app config schema"))
return nil, ErrInternal(errors.Wrap(err, "query app config schema"))
}
return (*QueryAppConfigSchemaResponse)(&schema), nil
@@ -282,7 +303,7 @@ func (a *api) ApplyAppConfig(
req *ApplyAppConfigRequest,
) (resp *ApplyAppConfigResponse, err error) {
if err := a.Dependencies().ConfigManager.SaveAppConfig(&req.App); err != nil {
return nil, rpc.ErrBadRequest(errors.Wrap(err, "save app config"))
return nil, ErrBadRequest(errors.Wrap(err, "save app config"))
}
return &ApplyAppConfigResponse{}, nil
@@ -297,20 +318,20 @@ func (a *api) QueryRSSHubCategories(
// New request.
forwardReq, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
if err != nil {
return nil, rpc.ErrInternal(errors.Wrap(err, "new request"))
return nil, ErrInternal(errors.Wrap(err, "new request"))
}
// Do request.
forwardRespIO, err := a.hc.Do(forwardReq)
if err != nil {
return nil, rpc.ErrInternal(errors.Wrap(err, "query rss hub websites"))
return nil, ErrInternal(errors.Wrap(err, "query rss hub websites"))
}
defer func() { _ = forwardRespIO.Body.Close() }()
// Parse response.
var forwardResp map[string]RSSHubWebsite
if err := json.NewDecoder(forwardRespIO.Body).Decode(&forwardResp); err != nil {
return nil, rpc.ErrInternal(errors.Wrap(err, "parse response"))
return nil, ErrInternal(errors.Wrap(err, "parse response"))
}
// Convert to response.
@@ -333,7 +354,7 @@ func (a *api) QueryRSSHubWebsites(
ctx context.Context, req *QueryRSSHubWebsitesRequest,
) (resp *QueryRSSHubWebsitesResponse, err error) {
if req.Category == "" {
return nil, rpc.ErrBadRequest(errors.New("category is required"))
return nil, ErrBadRequest(errors.New("category is required"))
}
url := a.Config().RSSHubEndpoint + "/api/category/" + req.Category
@@ -341,29 +362,29 @@ func (a *api) QueryRSSHubWebsites(
// New request.
forwardReq, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
if err != nil {
return nil, rpc.ErrInternal(errors.Wrap(err, "new request"))
return nil, ErrInternal(errors.Wrap(err, "new request"))
}
// Do request.
forwardRespIO, err := a.hc.Do(forwardReq)
if err != nil {
return nil, rpc.ErrInternal(errors.Wrap(err, "query rss hub routes"))
return nil, ErrInternal(errors.Wrap(err, "query rss hub routes"))
}
defer func() { _ = forwardRespIO.Body.Close() }()
// Parse response.
body, err := io.ReadAll(forwardRespIO.Body)
if err != nil {
return nil, rpc.ErrInternal(errors.Wrap(err, "read response"))
return nil, ErrInternal(errors.Wrap(err, "read response"))
}
if len(body) == 0 {
// Hack for RSSHub...
// Consider cache category ids for validate by self to remove this shit code.
return nil, rpc.ErrBadRequest(errors.New("category id is invalid"))
return nil, ErrBadRequest(errors.New("category id is invalid"))
}
var forwardResp map[string]RSSHubWebsite
if err := json.Unmarshal(body, &forwardResp); err != nil {
return nil, rpc.ErrInternal(errors.Wrap(err, "parse response"))
return nil, ErrInternal(errors.Wrap(err, "parse response"))
}
// Convert to response.
@@ -383,7 +404,7 @@ func (a *api) QueryRSSHubRoutes(
req *QueryRSSHubRoutesRequest,
) (resp *QueryRSSHubRoutesResponse, err error) {
if req.WebsiteID == "" {
return nil, rpc.ErrBadRequest(errors.New("website id is required"))
return nil, ErrBadRequest(errors.New("website id is required"))
}
url := a.Config().RSSHubEndpoint + "/api/namespace/" + req.WebsiteID
@@ -391,30 +412,30 @@ func (a *api) QueryRSSHubRoutes(
// New request.
forwardReq, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
if err != nil {
return nil, rpc.ErrInternal(errors.Wrap(err, "new request"))
return nil, ErrInternal(errors.Wrap(err, "new request"))
}
// Do request.
forwardRespIO, err := a.hc.Do(forwardReq)
if err != nil {
return nil, rpc.ErrInternal(errors.Wrap(err, "query rss hub routes"))
return nil, ErrInternal(errors.Wrap(err, "query rss hub routes"))
}
defer func() { _ = forwardRespIO.Body.Close() }()
// Parse response.
body, err := io.ReadAll(forwardRespIO.Body)
if err != nil {
return nil, rpc.ErrInternal(errors.Wrap(err, "read response"))
return nil, ErrInternal(errors.Wrap(err, "read response"))
}
if len(body) == 0 {
return nil, rpc.ErrBadRequest(errors.New("website id is invalid"))
return nil, ErrBadRequest(errors.New("website id is invalid"))
}
var forwardResp struct {
Routes map[string]RSSHubRoute `json:"routes"`
}
if err := json.Unmarshal(body, &forwardResp); err != nil {
return nil, rpc.ErrInternal(errors.Wrap(err, "parse response"))
return nil, ErrInternal(errors.Wrap(err, "parse response"))
}
// Convert to response.
@@ -435,7 +456,7 @@ func (a *api) Write(ctx context.Context, req *WriteRequest) (resp *WriteResponse
feed.Labels.Put(model.LabelType, "api", false)
}
if err := a.Dependencies().FeedStorage.Append(ctx, req.Feeds...); err != nil {
return nil, rpc.ErrInternal(errors.Wrap(err, "append"))
return nil, ErrInternal(errors.Wrap(err, "append"))
}
return &WriteResponse{}, nil
@@ -447,7 +468,7 @@ func (a *api) Query(ctx context.Context, req *QueryRequest) (resp *QueryResponse
// Validate request.
if err := req.Validate(); err != nil {
return nil, rpc.ErrBadRequest(errors.Wrap(err, "validate"))
return nil, ErrBadRequest(errors.Wrap(err, "validate"))
}
// Forward to storage.
@@ -460,7 +481,7 @@ func (a *api) Query(ctx context.Context, req *QueryRequest) (resp *QueryResponse
End: req.End,
})
if err != nil {
return nil, rpc.ErrInternal(errors.Wrap(err, "query"))
return nil, ErrInternal(errors.Wrap(err, "query"))
}
if len(feeds) == 0 {
return &QueryResponse{Feeds: []*block.FeedVO{}}, nil

View File

@@ -26,9 +26,8 @@ import (
"github.com/glidea/zenfeed/pkg/config"
telemetry "github.com/glidea/zenfeed/pkg/telemetry"
"github.com/glidea/zenfeed/pkg/telemetry/log"
"github.com/glidea/zenfeed/pkg/telemetry/metric"
telemetrymodel "github.com/glidea/zenfeed/pkg/telemetry/model"
"github.com/glidea/zenfeed/pkg/util/rpc"
"github.com/glidea/zenfeed/pkg/util/jsonrpc"
)
// --- Interface code block ---
@@ -89,18 +88,14 @@ func new(instance string, app *config.App, dependencies Dependencies) (Server, e
router := http.NewServeMux()
api := dependencies.API
router.Handle("/metrics", metric.Handler())
router.Handle("/health", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(200)
}))
router.Handle("/write", rpc.API(api.Write))
router.Handle("/query_config", rpc.API(api.QueryAppConfig))
router.Handle("/apply_config", rpc.API(api.ApplyAppConfig))
router.Handle("/query_config_schema", rpc.API(api.QueryAppConfigSchema))
router.Handle("/query_rsshub_categories", rpc.API(api.QueryRSSHubCategories))
router.Handle("/query_rsshub_websites", rpc.API(api.QueryRSSHubWebsites))
router.Handle("/query_rsshub_routes", rpc.API(api.QueryRSSHubRoutes))
router.Handle("/query", rpc.API(api.Query))
router.Handle("/write", jsonrpc.API(api.Write))
router.Handle("/query_config", jsonrpc.API(api.QueryAppConfig))
router.Handle("/apply_config", jsonrpc.API(api.ApplyAppConfig))
router.Handle("/query_config_schema", jsonrpc.API(api.QueryAppConfigSchema))
router.Handle("/query_rsshub_categories", jsonrpc.API(api.QueryRSSHubCategories))
router.Handle("/query_rsshub_websites", jsonrpc.API(api.QueryRSSHubWebsites))
router.Handle("/query_rsshub_routes", jsonrpc.API(api.QueryRSSHubRoutes))
router.Handle("/query", jsonrpc.API(api.Query))
httpServer := &http.Server{Addr: config.Address, Handler: router}
return &server{

233
pkg/api/rss/rss.go Normal file
View File

@@ -0,0 +1,233 @@
// Copyright (C) 2025 wangyusong
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
package rss
import (
	"context"
	"net"
	"net/http"
	"text/template"
	"time"

	"github.com/benbjohnson/clock"
	"github.com/gorilla/feeds"
	"github.com/pkg/errors"

	"github.com/glidea/zenfeed/pkg/api"
	"github.com/glidea/zenfeed/pkg/component"
	"github.com/glidea/zenfeed/pkg/config"
	"github.com/glidea/zenfeed/pkg/model"
	telemetry "github.com/glidea/zenfeed/pkg/telemetry"
	"github.com/glidea/zenfeed/pkg/telemetry/log"
	telemetrymodel "github.com/glidea/zenfeed/pkg/telemetry/model"
	"github.com/glidea/zenfeed/pkg/util/buffer"
)
// clk supplies the current time for the package; it is a variable rather than
// a direct time.Now call, presumably so tests can inject a mock clock — TODO confirm.
var clk = clock.New()
// --- Interface code block ---
// Server is the RSS API server component. It runs as a managed component and
// watches application-config changes so the RSS settings can be reloaded.
type Server interface {
	component.Component
	config.Watcher
}
// Config holds the RSS server settings derived from the application config.
type Config struct {
	Address             string             // listen address ([host]:port); Validate defaults it to ":1302".
	ContentHTMLTemplate string             // per-item content template source; Validate defaults it to "{{ .summary_html_snippet }}".
	contentHTMLTemplate *template.Template // parsed form of ContentHTMLTemplate, populated by Validate.
}
// Validate fills in defaults, checks that the listen address is a valid
// [host]:port pair, and pre-compiles the content template.
func (c *Config) Validate() error {
	// Apply defaults for anything left unset.
	if c.Address == "" {
		c.Address = ":1302"
	}
	if c.ContentHTMLTemplate == "" {
		c.ContentHTMLTemplate = "{{ .summary_html_snippet }}"
	}

	// The address must split cleanly into host and port.
	if _, _, err := net.SplitHostPort(c.Address); err != nil {
		return errors.Wrap(err, "invalid address")
	}

	// Compile the per-item content template once, up front.
	tmpl, err := template.New("").Parse(c.ContentHTMLTemplate)
	if err != nil {
		return errors.Wrap(err, "parse rss content template")
	}
	c.contentHTMLTemplate = tmpl

	return nil
}
// From copies the RSS-relevant fields out of the application config and
// returns the receiver for chaining.
func (c *Config) From(app *config.App) *Config {
	src := &app.API.RSS
	c.Address = src.Address
	c.ContentHTMLTemplate = src.ContentHTMLTemplate

	return c
}
// Dependencies are the collaborators the RSS server needs: the query API that
// requests are forwarded to.
type Dependencies struct {
	API api.API
}
// --- Factory code block ---
// Factory builds RSS Server instances from an application config.
type Factory component.Factory[Server, config.App, Dependencies]
// NewFactory returns a Factory for RSS servers. Without arguments it builds
// the real server; with mock options it builds a mock instead (for tests).
func NewFactory(mockOn ...component.MockOption) Factory {
	if len(mockOn) == 0 {
		return component.FactoryFunc[Server, config.App, Dependencies](new)
	}

	return component.FactoryFunc[Server, config.App, Dependencies](
		func(instance string, config *config.App, dependencies Dependencies) (Server, error) {
			m := &mockServer{}
			component.MockOptions(mockOn).Apply(&m.Mock)

			return m, nil
		},
	)
}
// new constructs the real RSS server: it derives and validates the config,
// then wires an HTTP server that routes every path to the rss handler.
func new(instance string, app *config.App, dependencies Dependencies) (Server, error) {
	conf := (&Config{}).From(app)
	if err := conf.Validate(); err != nil {
		return nil, errors.Wrap(err, "validate config")
	}

	srv := &server{
		Base: component.New(&component.BaseConfig[Config, Dependencies]{
			Name:         "RSSServer",
			Instance:     instance,
			Config:       conf,
			Dependencies: dependencies,
		}),
	}

	mux := http.NewServeMux()
	mux.Handle("/", http.HandlerFunc(srv.rss))
	srv.http = &http.Server{Addr: conf.Address, Handler: mux}

	return srv, nil
}
// --- Implementation code block ---
// server is the production Server implementation: a component wrapping a
// plain net/http server whose single handler renders query results as RSS.
type server struct {
	*component.Base[Config, Dependencies]
	http *http.Server // bound to Config.Address; serves the rss handler on "/".
}
// Run starts the RSS HTTP listener and blocks until the component context is
// canceled (graceful shutdown) or the listener fails.
func (s *server) Run() (err error) {
	ctx := telemetry.StartWith(s.Context(), append(s.TelemetryLabels(), telemetrymodel.KeyOperation, "Run")...)
	defer func() { telemetry.End(ctx, err) }()

	serverErr := make(chan error, 1) // buffered so the goroutine never blocks after we return.
	go func() {
		serverErr <- s.http.ListenAndServe()
	}()
	s.MarkReady()

	select {
	case <-ctx.Done():
		log.Info(ctx, "shutting down")
		// BUG FIX: ctx is already canceled at this point, so passing it to
		// Shutdown made Shutdown return immediately without draining in-flight
		// requests. Use a fresh context with a bounded grace period instead.
		shutdownCtx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
		defer cancel()

		return s.http.Shutdown(shutdownCtx)
	case err := <-serverErr:
		return errors.Wrap(err, "listen and serve")
	}
}
// Reload applies a new application config. The listen address is immutable
// while running, so a changed address is rejected.
func (s *server) Reload(app *config.App) error {
	next := (&Config{}).From(app)
	if err := next.Validate(); err != nil {
		return errors.Wrap(err, "validate config")
	}
	if next.Address != s.Config().Address {
		return errors.New("address cannot be reloaded")
	}

	s.SetConfig(next)

	return nil
}
// rss handles an RSS request: it forwards the query/label_filter parameters
// to the API (limited to the last 24 hours, at most 100 feeds), renders each
// feed's labels through the configured content template, and writes the
// result as RSS XML.
func (s *server) rss(w http.ResponseWriter, r *http.Request) {
	var err error
	ctx := telemetry.StartWith(r.Context(), append(s.TelemetryLabels(), telemetrymodel.KeyOperation, "rss")...)
	// BUG FIX: the original `defer telemetry.End(ctx, err)` evaluated err at
	// defer-statement time — always nil — so failures were never recorded.
	// The closure defers reading err until the handler actually returns.
	defer func() { telemetry.End(ctx, err) }()

	// Extract parameters.
	ps := r.URL.Query()
	labelFilters := ps["label_filter"]
	query := ps.Get("query")

	// Forward query request to API.
	now := clk.Now()
	queryResult, err := s.Dependencies().API.Query(ctx, &api.QueryRequest{
		Query:        query,
		LabelFilters: labelFilters,
		Start:        now.Add(-24 * time.Hour),
		End:          now,
		Limit:        100,
	})
	if err != nil {
		http.Error(w, err.Error(), http.StatusBadRequest) // TODO: standardize error handling.

		return
	}

	// Render and convert to RSS.
	rssObj := &feeds.Feed{
		Title:       "Zenfeed RSS - " + ps.Encode(),
		Description: "Powered by Github Zenfeed - https://github.com/glidea/zenfeed. If you use Folo, please enable 'Appearance - Content - Render inline styles'",
		Items:       make([]*feeds.Item, 0, len(queryResult.Feeds)),
	}
	buf := buffer.Get()
	defer buffer.Put(buf)
	for _, feed := range queryResult.Feeds {
		buf.Reset() // reuse one buffer across items.
		if err = s.Config().contentHTMLTemplate.Execute(buf, feed.Labels.Map()); err != nil {
			http.Error(w, err.Error(), http.StatusInternalServerError)

			return
		}
		item := &feeds.Item{
			Title:   feed.Labels.Get(model.LabelTitle),
			Link:    &feeds.Link{Href: feed.Labels.Get(model.LabelLink)},
			Created: feed.Time, // NOTE: scrape time, not pub time.
			Content: buf.String(),
		}
		rssObj.Items = append(rssObj.Items, item)
	}

	if err = rssObj.WriteRss(w); err != nil {
		log.Error(ctx, errors.Wrap(err, "write rss response"))

		return
	}
}
// mockServer is the test double built by NewFactory when mock options are given.
type mockServer struct {
	component.Mock
}

// Reload records the call on the embedded mock and returns its configured error.
func (m *mockServer) Reload(app *config.App) error {
	return m.Called(app).Error(0)
}

View File

@@ -30,6 +30,7 @@ import (
"github.com/glidea/zenfeed/pkg/telemetry"
"github.com/glidea/zenfeed/pkg/telemetry/log"
telemetrymodel "github.com/glidea/zenfeed/pkg/telemetry/model"
timeutil "github.com/glidea/zenfeed/pkg/util/time"
)
// --- Interface code block ---
@@ -45,10 +46,13 @@ type Config struct {
}
type App struct {
Timezone string `yaml:"timezone,omitempty" json:"timezone,omitempty" desc:"The timezone of the app. e.g. Asia/Shanghai. Default: server's local timezone"`
Log struct {
Level string `yaml:"level,omitempty" json:"level,omitempty" desc:"Log level, one of debug, info, warn, error. Default: info"`
} `yaml:"log,omitempty" json:"log,omitempty" desc:"The log config."`
Timezone string `yaml:"timezone,omitempty" json:"timezone,omitempty" desc:"The timezone of the app. e.g. Asia/Shanghai. Default: server's local timezone"`
Telemetry struct {
Address string `yaml:"address,omitempty" json:"address,omitempty" desc:"The address ([host]:port) of the telemetry server. e.g. 0.0.0.0:9090. Default: :9090. It can not be changed after the app is running."`
Log struct {
Level string `yaml:"level,omitempty" json:"level,omitempty" desc:"Log level, one of debug, info, warn, error. Default: info"`
} `yaml:"log,omitempty" json:"log,omitempty" desc:"The log config."`
} `yaml:"telemetry,omitempty" json:"telemetry,omitempty" desc:"The telemetry config."`
API struct {
HTTP struct {
Address string `yaml:"address,omitempty" json:"address,omitempty" desc:"The address ([host]:port) of the HTTP API. e.g. 0.0.0.0:1300. Default: :1300. It can not be changed after the app is running."`
@@ -56,9 +60,16 @@ type App struct {
MCP struct {
Address string `yaml:"address,omitempty" json:"address,omitempty" desc:"The address ([host]:port) of the MCP API. e.g. 0.0.0.0:1300. Default: :1301. It can not be changed after the app is running."`
} `yaml:"mcp,omitempty" json:"mcp,omitempty" desc:"The MCP API config."`
RSS struct {
Address string `yaml:"address,omitempty" json:"address,omitempty" desc:"The address ([host]:port) of the RSS API. e.g. 0.0.0.0:1300. Default: :1302. It can not be changed after the app is running."`
ContentHTMLTemplate string `yaml:"content_html_template,omitempty" json:"content_html_template,omitempty" desc:"The template to render the RSS content for each item. Default is {{ .summary_html_snippet }}."`
} `yaml:"rss,omitempty" json:"rss,omitempty" desc:"The RSS config."`
LLM string `yaml:"llm,omitempty" json:"llm,omitempty" desc:"The LLM name for summarizing feeds. e.g. my-favorite-gemini-king. Default is the default LLM in llms section."`
} `yaml:"api,omitempty" json:"api,omitempty" desc:"The API config."`
LLMs []LLM `yaml:"llms,omitempty" json:"llms,omitempty" desc:"The LLMs config. It is required, at least one LLM is needed, refered by other config sections."`
LLMs []LLM `yaml:"llms,omitempty" json:"llms,omitempty" desc:"The LLMs config. It is required, at least one LLM is needed, refered by other config sections."`
Jina struct {
Token string `yaml:"token,omitempty" json:"token,omitempty" desc:"The token of the Jina server."`
} `yaml:"jina,omitempty" json:"jina,omitempty" desc:"The Jina config."`
Scrape Scrape `yaml:"scrape,omitempty" json:"scrape,omitempty" desc:"The scrape config."`
Storage Storage `yaml:"storage,omitempty" json:"storage,omitempty" desc:"The storage config."`
Scheduls struct {
@@ -79,31 +90,42 @@ type LLM struct {
APIKey string `yaml:"api_key,omitempty" json:"api_key,omitempty" desc:"The API key of the LLM. It is required when api.llm is set."`
Model string `yaml:"model,omitempty" json:"model,omitempty" desc:"The model of the LLM. e.g. gpt-4o-mini. Can not be empty with embedding_model at same time when api.llm is set."`
EmbeddingModel string `yaml:"embedding_model,omitempty" json:"embedding_model,omitempty" desc:"The embedding model of the LLM. e.g. text-embedding-3-small. Can not be empty with model at same time when api.llm is set. NOTE: Once used, do not modify it directly, instead, add a new LLM configuration."`
TTSModel string `yaml:"tts_model,omitempty" json:"tts_model,omitempty" desc:"The TTS model of the LLM."`
Temperature float32 `yaml:"temperature,omitempty" json:"temperature,omitempty" desc:"The temperature (0-2) of the LLM. Default: 0.0"`
}
type Scrape struct {
Past time.Duration `yaml:"past,omitempty" json:"past,omitempty" desc:"The lookback time window for scraping feeds. e.g. 1h means only scrape feeds in the past 1 hour. Default: 3d"`
Interval time.Duration `yaml:"interval,omitempty" json:"interval,omitempty" desc:"How often to scrape each source, it is a global interval. e.g. 1h. Default: 1h"`
RSSHubEndpoint string `yaml:"rsshub_endpoint,omitempty" json:"rsshub_endpoint,omitempty" desc:"The endpoint of the RSSHub. You can deploy your own RSSHub server or use the public one (https://docs.rsshub.app/guide/instances). e.g. https://rsshub.app. It is required when sources[].rss.rsshub_route_path is set."`
Sources []ScrapeSource `yaml:"sources,omitempty" json:"sources,omitempty" desc:"The sources for scraping feeds."`
Past timeutil.Duration `yaml:"past,omitempty" json:"past,omitempty" desc:"The lookback time window for scraping feeds. e.g. 1h means only scrape feeds in the past 1 hour. Default: 3d"`
Interval timeutil.Duration `yaml:"interval,omitempty" json:"interval,omitempty" desc:"How often to scrape each source, it is a global interval. e.g. 1h. Default: 1h"`
RSSHubEndpoint string `yaml:"rsshub_endpoint,omitempty" json:"rsshub_endpoint,omitempty" desc:"The endpoint of the RSSHub. You can deploy your own RSSHub server or use the public one (https://docs.rsshub.app/guide/instances). e.g. https://rsshub.app. It is required when sources[].rss.rsshub_route_path is set."`
RSSHubAccessKey string `yaml:"rsshub_access_key,omitempty" json:"rsshub_access_key,omitempty" desc:"The access key for RSSHub. Used for access control. (see [RSSHub config](https://docs.rsshub.app/deploy/config#access-control-configurations))"`
Sources []ScrapeSource `yaml:"sources,omitempty" json:"sources,omitempty" desc:"The sources for scraping feeds."`
}
type Storage struct {
Dir string `yaml:"dir,omitempty" json:"dir,omitempty" desc:"The base directory of the all storages. Default: ./data. It can not be changed after the app is running."`
Feed FeedStorage `yaml:"feed,omitempty" json:"feed,omitempty" desc:"The feed storage config."`
Dir string `yaml:"dir,omitempty" json:"dir,omitempty" desc:"The base directory of the all storages. Default: ./data. It can not be changed after the app is running."`
Feed FeedStorage `yaml:"feed,omitempty" json:"feed,omitempty" desc:"The feed storage config."`
Object ObjectStorage `yaml:"object,omitempty" json:"object,omitempty" desc:"The object storage config."`
}
type FeedStorage struct {
Rewrites []RewriteRule `yaml:"rewrites,omitempty" json:"rewrites,omitempty" desc:"How to process each feed before storing it. It inspired by Prometheus relabeling (https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config), this implements a very strong flexibility and loose coupling."`
FlushInterval time.Duration `yaml:"flush_interval,omitempty" json:"flush_interval,omitempty" desc:"How often to flush the feed storage to the database, higher value will cause high data loss risk, but on the other hand, it will reduce the number of disk operations and improve performance. Default: 200ms"`
EmbeddingLLM string `yaml:"embedding_llm,omitempty" json:"embedding_llm,omitempty" desc:"The embedding LLM for the feed storage. It will significantly affect the accuracy of semantic search, please be careful to choose. If you want to switch, please note to keep the old llm configuration, because the past data is still implicitly associated with it, otherwise it will cause the past data to be unable to be semantically searched. Default is the default LLM in llms section."`
Retention time.Duration `yaml:"retention,omitempty" json:"retention,omitempty" desc:"How long to keep a feed. Default: 8d"`
BlockDuration time.Duration `yaml:"block_duration,omitempty" json:"block_duration,omitempty" desc:"How long to keep the feed storage block. Block is time-based, like Prometheus TSDB Block. Default: 25h"`
Rewrites []RewriteRule `yaml:"rewrites,omitempty" json:"rewrites,omitempty" desc:"How to process each feed before storing it. It inspired by Prometheus relabeling (https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config), this implements a very strong flexibility and loose coupling."`
FlushInterval timeutil.Duration `yaml:"flush_interval,omitempty" json:"flush_interval,omitempty" desc:"How often to flush the feed storage to the database, higher value will cause high data loss risk, but on the other hand, it will reduce the number of disk operations and improve performance. Default: 200ms"`
EmbeddingLLM string `yaml:"embedding_llm,omitempty" json:"embedding_llm,omitempty" desc:"The embedding LLM for the feed storage. It will significantly affect the accuracy of semantic search, please be careful to choose. If you want to switch, please note to keep the old llm configuration, because the past data is still implicitly associated with it, otherwise it will cause the past data to be unable to be semantically searched. Default is the default LLM in llms section."`
Retention timeutil.Duration `yaml:"retention,omitempty" json:"retention,omitempty" desc:"How long to keep a feed. Default: 8d"`
BlockDuration timeutil.Duration `yaml:"block_duration,omitempty" json:"block_duration,omitempty" desc:"How long to keep the feed storage block. Block is time-based, like Prometheus TSDB Block. Default: 25h"`
}
type ObjectStorage struct {
Endpoint string `yaml:"endpoint,omitempty" json:"endpoint,omitempty" desc:"The endpoint of the object storage."`
AccessKeyID string `yaml:"access_key_id,omitempty" json:"access_key_id,omitempty" desc:"The access key id of the object storage."`
SecretAccessKey string `yaml:"secret_access_key,omitempty" json:"secret_access_key,omitempty" desc:"The secret access key of the object storage."`
Bucket string `yaml:"bucket,omitempty" json:"bucket,omitempty" desc:"The bucket of the object storage."`
BucketURL string `yaml:"bucket_url,omitempty" json:"bucket_url,omitempty" desc:"The public URL of the object storage bucket."`
}
type ScrapeSource struct {
Interval time.Duration `yaml:"interval,omitempty" json:"interval,omitempty" desc:"How often to scrape this source. Default: global interval"`
Interval timeutil.Duration `yaml:"interval,omitempty" json:"interval,omitempty" desc:"How often to scrape this source. Default: global interval"`
Name string `yaml:"name,omitempty" json:"name,omitempty" desc:"The name of the source. It is required."`
Labels map[string]string `yaml:"labels,omitempty" json:"labels,omitempty" desc:"The additional labels to add to the feed of this source."`
RSS *ScrapeSourceRSS `yaml:"rss,omitempty" json:"rss,omitempty" desc:"The RSS config of the source."`
@@ -115,6 +137,7 @@ type ScrapeSourceRSS struct {
}
type RewriteRule struct {
If []string `yaml:"if,omitempty" json:"if,omitempty" desc:"The condition config to match the feed. If not set, that means match all feeds. Like label filters, e.g. [source=github, title!=xxx]"`
SourceLabel string `yaml:"source_label,omitempty" json:"source_label,omitempty" desc:"The feed label of the source text to transform. Default is the 'content' label. The feed is essentially a label set (similar to Prometheus metric data). The default labels are type (rss, email (in future), etc), source (the source name), title (feed title), link (feed link), pub_time (feed publish time), and content (feed content)."`
SkipTooShortThreshold *int `yaml:"skip_too_short_threshold,omitempty" json:"skip_too_short_threshold,omitempty" desc:"The threshold of the source text length to skip. Default is 300. It helps we to filter out some short feeds."`
Transform *RewriteRuleTransform `yaml:"transform,omitempty" json:"transform,omitempty" desc:"The transform config to transform the source text. If not set, that means transform nothing, so the source text is the transformed text."`
@@ -125,26 +148,46 @@ type RewriteRule struct {
}
type RewriteRuleTransform struct {
ToText *RewriteRuleTransformToText `yaml:"to_text,omitempty" json:"to_text,omitempty" desc:"The transform config to transform the source text to text."`
ToText *RewriteRuleTransformToText `yaml:"to_text,omitempty" json:"to_text,omitempty" desc:"The transform config to transform the source text to text."`
ToPodcast *RewriteRuleTransformToPodcast `yaml:"to_podcast,omitempty" json:"to_podcast,omitempty" desc:"The transform config to transform the source text to podcast."`
}
type RewriteRuleTransformToText struct {
Type string `yaml:"type,omitempty" json:"type,omitempty" desc:"The type of the transform. It can be one of prompt, crawl, crawl_by_jina. Default is prompt. For crawl, the source text will be as the url to crawl the page, and the page will be converted to markdown. crawl vs crawl_by_jina: crawl is local, more stable; crawl_by_jina is powered by https://jina.ai, more powerful."`
LLM string `yaml:"llm,omitempty" json:"llm,omitempty" desc:"The LLM name to use. Default is the default LLM in llms section."`
Prompt string `yaml:"prompt,omitempty" json:"prompt,omitempty" desc:"The prompt to transform the source text. The source text will be injected into the prompt above. And you can use go template syntax to refer some built-in prompts, like {{ .summary }}. Available built-in prompts: category, tags, score, comment_confucius, summary, summary_html_snippet."`
}
type RewriteRuleTransformToPodcast struct {
LLM string `yaml:"llm,omitempty" json:"llm,omitempty" desc:"The LLM name to use. Default is the default LLM in llms section."`
EstimateMaximumDuration timeutil.Duration `yaml:"estimate_maximum_duration,omitempty" json:"estimate_maximum_duration,omitempty" desc:"The estimated maximum duration of the podcast. It will affect the length of the generated transcript. e.g. 5m. Default is 5m."`
TranscriptAdditionalPrompt string `yaml:"transcript_additional_prompt,omitempty" json:"transcript_additional_prompt,omitempty" desc:"The additional prompt to add to the transcript. It is optional."`
TTSLLM string `yaml:"tts_llm,omitempty" json:"tts_llm,omitempty" desc:"The LLM name to use for TTS. Only supports gemini now. Default is the default LLM in llms section."`
Speakers []RewriteRuleTransformToPodcastSpeaker `yaml:"speakers,omitempty" json:"speakers,omitempty" desc:"The speakers to use. It is required, at least one speaker is needed."`
}
type RewriteRuleTransformToPodcastSpeaker struct {
Name string `yaml:"name,omitempty" json:"name,omitempty" desc:"The name of the speaker. It is required."`
Role string `yaml:"role,omitempty" json:"role,omitempty" desc:"The role description of the speaker. You can think of it as a character setting."`
Voice string `yaml:"voice,omitempty" json:"voice,omitempty" desc:"The voice of the speaker. It is required."`
}
type SchedulsRule struct {
Name string `yaml:"name,omitempty" json:"name,omitempty" desc:"The name of the rule. It is required."`
Query string `yaml:"query,omitempty" json:"query,omitempty" desc:"The semantic query to get the feeds. NOTE it is optional"`
Threshold float32 `yaml:"threshold,omitempty" json:"threshold,omitempty" desc:"The threshold to filter the query result by relevance (with 'query') score. It does not work when query is not set. Default is 0.6."`
LabelFilters []string `yaml:"label_filters,omitempty" json:"label_filters,omitempty" desc:"The label filters (equal or not equal) to match the feeds. e.g. [category=tech, source!=github]"`
EveryDay string `yaml:"every_day,omitempty" json:"every_day,omitempty" desc:"The query range at the end time of every day. Format: start~end, e.g. 00:00~23:59, or -22:00~7:00 (yesterday 22:00 to today 07:00)."`
WatchInterval time.Duration `yaml:"watch_interval,omitempty" json:"watch_interval,omitempty" desc:"The run and query interval to watch the rule. Default is 10m. It can not be set with every_day at same time."`
Name string `yaml:"name,omitempty" json:"name,omitempty" desc:"The name of the rule. It is required."`
Query string `yaml:"query,omitempty" json:"query,omitempty" desc:"The semantic query to get the feeds. NOTE it is optional"`
Threshold float32 `yaml:"threshold,omitempty" json:"threshold,omitempty" desc:"The threshold to filter the query result by relevance (with 'query') score. It does not work when query is not set. Default is 0.6."`
LabelFilters []string `yaml:"label_filters,omitempty" json:"label_filters,omitempty" desc:"The label filters (equal or not equal) to match the feeds. e.g. [category=tech, source!=github]"`
Labels map[string]string `yaml:"labels,omitempty" json:"labels,omitempty" desc:"The labels to attach to the feeds."`
EveryDay string `yaml:"every_day,omitempty" json:"every_day,omitempty" desc:"The query range at the end time of every day. Format: start~end, e.g. 00:00~23:59, or -22:00~7:00 (yesterday 22:00 to today 07:00)."`
WatchInterval timeutil.Duration `yaml:"watch_interval,omitempty" json:"watch_interval,omitempty" desc:"The run and query interval to watch the rule. Default is 10m. It can not be set with every_day at same time."`
}
type NotifyRoute struct {
Receivers []string `yaml:"receivers,omitempty" json:"receivers,omitempty" desc:"The notify receivers. It is required, at least one receiver is needed."`
GroupBy []string `yaml:"group_by,omitempty" json:"group_by,omitempty" desc:"The group by config to group the feeds, each group will be notified individually. It is required, at least one group by is needed."`
SourceLabel string `yaml:"source_label,omitempty" json:"source_label,omitempty" desc:"The source label to extract the content from each feed, and summarize them. Default are all labels. It is very recommended to set it to 'summary' to reduce context length."`
SummaryPrompt string `yaml:"summary_prompt,omitempty" json:"summary_prompt,omitempty" desc:"The prompt to summarize the feeds of each group."`
LLM string `yaml:"llm,omitempty" json:"llm,omitempty" desc:"The LLM name to use. Default is the default LLM in llms section. A large context length LLM is recommended."`
CompressByRelatedThreshold *float32 `yaml:"compress_by_related_threshold,omitempty" json:"compress_by_related_threshold,omitempty" desc:"The threshold to compress the feeds by relatedness, that is, if the feeds are too similar, only one will be notified. Default is 0.85."`
SubRoutes []NotifySubRoute `yaml:"sub_routes,omitempty" json:"sub_routes,omitempty" desc:"The sub routes to notify the feeds. A feed prefers to be matched by the sub routes, if not matched, it will be matched by the parent route."`
}
@@ -154,20 +197,22 @@ type NotifySubRoute struct {
Receivers []string `yaml:"receivers,omitempty" json:"receivers,omitempty" desc:"The notify receivers. It is required, at least one receiver is needed."`
GroupBy []string `yaml:"group_by,omitempty" json:"group_by,omitempty" desc:"The group by config to group the feeds, each group will be notified individually. It is required, at least one group by is needed."`
SourceLabel string `yaml:"source_label,omitempty" json:"source_label,omitempty" desc:"The source label to extract the content from each feed, and summarize them. Default are all labels. It is very recommended to set it to 'summary' to reduce context length."`
SummaryPrompt string `yaml:"summary_prompt,omitempty" json:"summary_prompt,omitempty" desc:"The prompt to summarize the feeds of each group."`
LLM string `yaml:"llm,omitempty" json:"llm,omitempty" desc:"The LLM name to use. Default is the default LLM in llms section. A large context length LLM is recommended."`
CompressByRelatedThreshold *float32 `yaml:"compress_by_related_threshold,omitempty" json:"compress_by_related_threshold,omitempty" desc:"The threshold to compress the feeds by relatedness, that is, if the feeds are too similar, only one will be notified. Default is 0.85."`
SubRoutes []NotifySubRoute `yaml:"sub_routes,omitempty" json:"sub_routes,omitempty" desc:"The sub routes to notify the feeds. A feed prefers to be matched by the sub routes, if not matched, it will be matched by the parent route."`
}
type NotifyReceiver struct {
Name string `yaml:"name,omitempty" json:"name,omitempty" desc:"The name of the receiver. It is required."`
Email string `yaml:"email,omitempty" json:"email,omitempty" desc:"The email of the receiver."`
// TODO: to reduce copyright risk, we do not support webhook receiver now.
// Webhook *NotifyReceiverWebhook `yaml:"webhook" json:"webhook" desc:"The webhook of the receiver."`
Name string `yaml:"name,omitempty" json:"name,omitempty" desc:"The name of the receiver. It is required."`
Email string `yaml:"email,omitempty" json:"email,omitempty" desc:"The email of the receiver."`
Webhook *NotifyReceiverWebhook `yaml:"webhook" json:"webhook" desc:"The webhook of the receiver."`
}
// type NotifyReceiverWebhook struct {
// URL string `yaml:"url"`
// }
type NotifyReceiverWebhook struct {
URL string `yaml:"url"`
}
type NotifyChannels struct {
Email *NotifyChannelEmail `yaml:"email,omitempty" json:"email,omitempty" desc:"The global email channel config."`

248
pkg/llm/gemini.go Normal file
View File

@@ -0,0 +1,248 @@
// Copyright (C) 2025 wangyusong
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
package llm
import (
"bytes"
"context"
"encoding/base64"
"encoding/json"
"io"
"net/http"
"path/filepath"
"github.com/pkg/errors"
oai "github.com/sashabaranov/go-openai"
"github.com/glidea/zenfeed/pkg/component"
"github.com/glidea/zenfeed/pkg/telemetry"
telemetrymodel "github.com/glidea/zenfeed/pkg/telemetry/model"
"github.com/glidea/zenfeed/pkg/util/wav"
)
// gemini is an LLM implementation backed by Google Gemini.
// Text and embedding calls are served by the embedded text implementation
// (an openaiText pointed at Gemini's OpenAI-compatible endpoint; see
// newGemini), while WAV (TTS) uses Gemini's native generateContent API.
type gemini struct {
	*component.Base[Config, struct{}]
	text // String/EmbeddingLabels/Embedding via the OpenAI-compatible endpoint.

	// hc issues the native generateContent TTS requests (see doWAVRequest).
	hc *http.Client
	// NOTE(review): stored here but not passed to the embedded openaiText in
	// newGemini — confirm this field is actually consumed anywhere.
	embeddingSpliter embeddingSpliter
}
// newGemini builds a Gemini-backed LLM. Text generation and embeddings are
// routed through Gemini's OpenAI-compatible endpoint by reusing the shared
// openaiText implementation; TTS goes through the native API with a plain
// http.Client (see WAV / doWAVRequest).
func newGemini(c *Config) LLM {
	config := oai.DefaultConfig(c.APIKey)
	// NOTE(review): filepath.Join cleans the path and collapses the "//" in
	// "https://", yielding e.g. "https:/host/v1beta/openai" on Unix. Plain
	// string concatenation (or url.JoinPath) looks safer here — confirm
	// against a live endpoint before changing.
	config.BaseURL = filepath.Join(c.Endpoint, "openai") // OpenAI compatible endpoint.
	client := oai.NewClientWithConfig(config)

	// Splitter bounds mirror the values used by newOpenAI elsewhere in this
	// package (1536 / 64); see newEmbeddingSpliter for the units.
	embeddingSpliter := newEmbeddingSpliter(1536, 64)

	base := component.New(&component.BaseConfig[Config, struct{}]{
		Name:     "LLM/gemini",
		Instance: c.Name,
		Config:   c,
	})

	return &gemini{
		Base: base,
		text: &openaiText{
			Base:   base,
			client: client,
		},
		hc:               &http.Client{},
		embeddingSpliter: embeddingSpliter,
	}
}
// WAV synthesizes text into WAV audio with the configured Gemini TTS model
// and the given speakers. The returned reader streams a WAV header followed
// by the PCM payload; the caller is responsible for closing it.
func (g *gemini) WAV(ctx context.Context, text string, speakers []Speaker) (r io.ReadCloser, err error) {
	ctx = telemetry.StartWith(ctx, append(g.TelemetryLabels(), telemetrymodel.KeyOperation, "WAV")...)
	defer func() { telemetry.End(ctx, err) }()

	// TTS is opt-in: it requires a dedicated model to be configured.
	if g.Config().TTSModel == "" {
		return nil, errors.New("tts model is not set")
	}

	payload, err := buildWAVRequestPayload(text, speakers)
	if err != nil {
		return nil, errors.Wrap(err, "build wav request payload")
	}

	pcm, err := g.doWAVRequest(ctx, payload)
	if err != nil {
		return nil, errors.Wrap(err, "do wav request")
	}

	return streamWAV(pcm), nil
}
// doWAVRequest posts the TTS payload to Gemini's native generateContent API
// and returns the raw PCM bytes decoded from the first candidate part.
// It returns an error on any non-200 status or on a response that carries
// no inline audio data.
func (g *gemini) doWAVRequest(ctx context.Context, reqPayload *geminiRequest) ([]byte, error) {
	config := g.Config()

	body, err := json.Marshal(reqPayload)
	if err != nil {
		return nil, errors.Wrap(err, "marshal tts request")
	}

	// Native (non-OpenAI-compatible) endpoint; auth via the x-goog-api-key header.
	url := config.Endpoint + "/models/" + config.TTSModel + ":generateContent"
	req, err := http.NewRequestWithContext(ctx, http.MethodPost, url, bytes.NewReader(body))
	if err != nil {
		return nil, errors.Wrap(err, "new tts request")
	}
	req.Header.Set("x-goog-api-key", config.APIKey)
	req.Header.Set("Content-Type", "application/json")

	resp, err := g.hc.Do(req)
	if err != nil {
		return nil, errors.Wrap(err, "do tts request")
	}
	defer func() { _ = resp.Body.Close() }()

	if resp.StatusCode != http.StatusOK {
		errMsg, _ := io.ReadAll(resp.Body) // Best-effort: include the body in the error.
		return nil, errors.Errorf("tts request failed with status %d: %s", resp.StatusCode, string(errMsg))
	}

	var ttsResp geminiResponse
	if err := json.NewDecoder(resp.Body).Decode(&ttsResp); err != nil {
		return nil, errors.Wrap(err, "decode tts response")
	}

	// Guard every level of the optional response shape. Content is a pointer
	// and was previously dereferenced without a nil check, which could panic
	// on a malformed or empty response.
	if len(ttsResp.Candidates) == 0 ||
		ttsResp.Candidates[0].Content == nil ||
		len(ttsResp.Candidates[0].Content.Parts) == 0 ||
		ttsResp.Candidates[0].Content.Parts[0].InlineData == nil {
		return nil, errors.New("no audio data in tts response")
	}

	// The audio arrives base64-encoded raw PCM (no container); decode it here
	// and let the caller prepend a WAV header.
	audioDataB64 := ttsResp.Candidates[0].Content.Parts[0].InlineData.Data
	pcmData, err := base64.StdEncoding.DecodeString(audioDataB64)
	if err != nil {
		return nil, errors.Wrap(err, "decode base64")
	}

	return pcmData, nil
}
// buildWAVRequestPayload assembles a Gemini generateContent request asking for
// AUDIO output of the given text. A single speaker maps to a plain prebuilt
// voice config; multiple speakers map to a multi-speaker config keyed by
// speaker name. At least one speaker is required.
func buildWAVRequestPayload(text string, speakers []Speaker) (*geminiRequest, error) {
	if len(speakers) == 0 {
		return nil, errors.New("no speakers")
	}

	payload := &geminiRequest{
		Contents: []*geminiRequestContent{{Parts: []*geminiRequestPart{{Text: text}}}},
		Config: &geminiRequestConfig{
			ResponseModalities: []string{"AUDIO"},
			SpeechConfig:       &geminiRequestSpeechConfig{},
		},
	}

	if len(speakers) == 1 {
		payload.Config.SpeechConfig.VoiceConfig = &geminiRequestVoiceConfig{
			PrebuiltVoiceConfig: &geminiRequestPrebuiltVoiceConfig{VoiceName: speakers[0].Voice},
		}

		return payload, nil
	}

	multi := &geminiRequestMultiSpeakerVoiceConfig{}
	for _, sp := range speakers {
		multi.SpeakerVoiceConfigs = append(multi.SpeakerVoiceConfigs, &geminiRequestSpeakerVoiceConfig{
			Speaker: sp.Name,
			VoiceConfig: &geminiRequestVoiceConfig{
				PrebuiltVoiceConfig: &geminiRequestPrebuiltVoiceConfig{VoiceName: sp.Voice},
			},
		})
	}
	payload.Config.SpeechConfig.MultiSpeakerVoiceConfig = multi

	return payload, nil
}
// streamWAV wraps raw PCM bytes into a WAV stream without materializing the
// whole file: a goroutine writes the header and then the PCM payload into a
// pipe, and the read side is handed to the caller (who must close it). Any
// write failure is surfaced to the reader via CloseWithError.
func streamWAV(pcmData []byte) io.ReadCloser {
	pr, pw := io.Pipe()

	go func() {
		defer func() { _ = pw.Close() }()

		if err := wav.WriteHeader(pw, geminiWavHeader, uint32(len(pcmData))); err != nil {
			pw.CloseWithError(errors.Wrap(err, "write wav header"))

			return
		}
		if _, err := pw.Write(pcmData); err != nil {
			pw.CloseWithError(errors.Wrap(err, "write pcm data"))

			return
		}
	}()

	return pr
}
// geminiWavHeader is the WAV header prepended to the raw PCM returned by the
// TTS endpoint: 24 kHz sample rate, 16-bit depth, mono — the format assumed
// for Gemini TTS output (TODO confirm against the API docs).
var geminiWavHeader = &wav.Header{
	SampleRate:  24000,
	BitDepth:    16,
	NumChannels: 1,
}
// geminiRequest is the body of a native generateContent call as used for TTS
// in this file (prompt contents plus a generation config requesting audio).
type geminiRequest struct {
	Contents []*geminiRequestContent `json:"contents"`
	Config   *geminiRequestConfig    `json:"generationConfig"`
}

// geminiRequestContent is one content entry, composed of parts.
type geminiRequestContent struct {
	Parts []*geminiRequestPart `json:"parts"`
}

// geminiRequestPart carries the text to synthesize.
type geminiRequestPart struct {
	Text string `json:"text"`
}

// geminiRequestConfig selects the response modality (here always "AUDIO")
// and the speech configuration.
type geminiRequestConfig struct {
	ResponseModalities []string                   `json:"responseModalities"`
	SpeechConfig       *geminiRequestSpeechConfig `json:"speechConfig"`
}

// geminiRequestSpeechConfig holds either a single-voice config or a
// multi-speaker config; exactly one is set by buildWAVRequestPayload.
type geminiRequestSpeechConfig struct {
	VoiceConfig             *geminiRequestVoiceConfig             `json:"voiceConfig,omitempty"`
	MultiSpeakerVoiceConfig *geminiRequestMultiSpeakerVoiceConfig `json:"multiSpeakerVoiceConfig,omitempty"`
}

// geminiRequestVoiceConfig selects a prebuilt voice.
type geminiRequestVoiceConfig struct {
	PrebuiltVoiceConfig *geminiRequestPrebuiltVoiceConfig `json:"prebuiltVoiceConfig,omitempty"`
}

// geminiRequestPrebuiltVoiceConfig names the prebuilt voice to use.
type geminiRequestPrebuiltVoiceConfig struct {
	VoiceName string `json:"voiceName,omitempty"`
}

// geminiRequestMultiSpeakerVoiceConfig maps each named speaker to a voice
// for multi-speaker synthesis.
type geminiRequestMultiSpeakerVoiceConfig struct {
	SpeakerVoiceConfigs []*geminiRequestSpeakerVoiceConfig `json:"speakerVoiceConfigs,omitempty"`
}

// geminiRequestSpeakerVoiceConfig binds one speaker name to a voice config.
type geminiRequestSpeakerVoiceConfig struct {
	Speaker     string                    `json:"speaker,omitempty"`
	VoiceConfig *geminiRequestVoiceConfig `json:"voiceConfig,omitempty"`
}

// geminiResponse is the subset of the generateContent response this file
// reads: candidates carrying inline (base64) audio data.
type geminiResponse struct {
	Candidates []*geminiResponseCandidate `json:"candidates"`
}

// geminiResponseCandidate is one generated candidate.
type geminiResponseCandidate struct {
	Content *geminiResponseContent `json:"content"`
}

// geminiResponseContent groups the candidate's parts.
type geminiResponseContent struct {
	Parts []*geminiResponsePart `json:"parts"`
}

// geminiResponsePart carries the inline payload of one part.
type geminiResponsePart struct {
	InlineData *geminiResponseInlineData `json:"inlineData"`
}

// geminiResponseInlineData is the inline media payload.
type geminiResponseInlineData struct {
	MimeType string `json:"mimeType"`
	Data     string `json:"data"` // Base64 encoded.
}

View File

@@ -16,9 +16,12 @@
package llm
import (
"bytes"
"context"
"io"
"reflect"
"strconv"
"strings"
"sync"
"time"
@@ -33,25 +36,41 @@ import (
"github.com/glidea/zenfeed/pkg/storage/kv"
"github.com/glidea/zenfeed/pkg/telemetry/log"
telemetrymodel "github.com/glidea/zenfeed/pkg/telemetry/model"
binaryutil "github.com/glidea/zenfeed/pkg/util/binary"
"github.com/glidea/zenfeed/pkg/util/buffer"
"github.com/glidea/zenfeed/pkg/util/hash"
)
// --- Interface code block ---
type LLM interface {
component.Component
text
audio
}
type text interface {
String(ctx context.Context, messages []string) (string, error)
EmbeddingLabels(ctx context.Context, labels model.Labels) ([][]float32, error)
Embedding(ctx context.Context, text string) ([]float32, error)
}
type audio interface {
WAV(ctx context.Context, text string, speakers []Speaker) (io.ReadCloser, error)
}
type Speaker struct {
Name string
Voice string
}
type Config struct {
Name string
Default bool
Provider ProviderType
Endpoint string
APIKey string
Model, EmbeddingModel string
Temperature float32
Name string
Default bool
Provider ProviderType
Endpoint string
APIKey string
Model, EmbeddingModel, TTSModel string
Temperature float32
}
type ProviderType string
@@ -69,7 +88,7 @@ var defaultEndpoints = map[ProviderType]string{
ProviderTypeOpenAI: "https://api.openai.com/v1",
ProviderTypeOpenRouter: "https://openrouter.ai/api/v1",
ProviderTypeDeepSeek: "https://api.deepseek.com/v1",
ProviderTypeGemini: "https://generativelanguage.googleapis.com/v1beta/openai",
ProviderTypeGemini: "https://generativelanguage.googleapis.com/v1beta",
ProviderTypeVolc: "https://ark.cn-beijing.volces.com/api/v3",
ProviderTypeSiliconFlow: "https://api.siliconflow.cn/v1",
}
@@ -94,8 +113,8 @@ func (c *Config) Validate() error { //nolint:cyclop
if c.APIKey == "" {
return errors.New("api key is required")
}
if c.Model == "" && c.EmbeddingModel == "" {
return errors.New("model or embedding model is required")
if c.Model == "" && c.EmbeddingModel == "" && c.TTSModel == "" {
return errors.New("model or embedding model or tts model is required")
}
if c.Temperature < 0 || c.Temperature > 2 {
return errors.Errorf("invalid temperature: %f, should be in range [0, 2]", c.Temperature)
@@ -179,6 +198,7 @@ func (c *FactoryConfig) From(app *config.App) {
APIKey: llm.APIKey,
Model: llm.Model,
EmbeddingModel: llm.EmbeddingModel,
TTSModel: llm.TTSModel,
Temperature: llm.Temperature,
})
}
@@ -204,12 +224,9 @@ func NewFactory(
) (Factory, error) {
if len(mockOn) > 0 {
mf := &mockFactory{}
getCall := mf.On("Get", mock.Anything)
getCall.Run(func(args mock.Arguments) {
m := &mockLLM{}
component.MockOptions(mockOn).Apply(&m.Mock)
getCall.Return(m, nil)
})
m := &mockLLM{}
component.MockOptions(mockOn).Apply(&m.Mock)
mf.On("Get", mock.Anything).Return(m)
mf.On("Reload", mock.Anything).Return(nil)
return mf, nil
@@ -304,11 +321,6 @@ func (f *factory) Get(name string) LLM {
continue
}
if f.llms[name] == nil {
llm := f.new(&llmC)
f.llms[name] = llm
}
return f.llms[name]
}
@@ -317,8 +329,12 @@ func (f *factory) Get(name string) LLM {
func (f *factory) new(c *Config) LLM {
switch c.Provider {
case ProviderTypeOpenAI, ProviderTypeOpenRouter, ProviderTypeDeepSeek, ProviderTypeGemini, ProviderTypeVolc, ProviderTypeSiliconFlow: //nolint:lll
case ProviderTypeOpenAI, ProviderTypeOpenRouter, ProviderTypeDeepSeek, ProviderTypeVolc, ProviderTypeSiliconFlow: //nolint:lll
return newCached(newOpenAI(c), f.Dependencies().KVStorage)
case ProviderTypeGemini:
return newCached(newGemini(c), f.Dependencies().KVStorage)
default:
return newCached(newOpenAI(c), f.Dependencies().KVStorage)
}
@@ -330,14 +346,17 @@ func (f *factory) initLLMs() {
llms = make(map[string]LLM, len(config.LLMs))
defaultLLM LLM
)
for _, llmC := range config.LLMs {
llm := f.new(&llmC)
llms[llmC.Name] = llm
if llmC.Name == config.defaultLLM {
defaultLLM = llm
}
}
f.llms = llms
f.defaultLLM = defaultLLM
}
@@ -373,24 +392,97 @@ func newCached(llm LLM, kvStorage kv.Storage) LLM {
func (c *cached) String(ctx context.Context, messages []string) (string, error) {
key := hash.Sum64s(messages)
keyStr := strconv.FormatUint(key, 10)
keyStr := strconv.FormatUint(key, 10) // for human readable & compatible.
value, err := c.kvStorage.Get(ctx, keyStr)
valueBs, err := c.kvStorage.Get(ctx, []byte(keyStr))
switch {
case err == nil:
return value, nil
return string(valueBs), nil
case errors.Is(err, kv.ErrNotFound):
break
default:
return "", errors.Wrap(err, "get from kv storage")
}
value, err = c.LLM.String(ctx, messages)
value, err := c.LLM.String(ctx, messages)
if err != nil {
return "", err
}
if strings.Trim(value, " \n\r\t") == "" {
return "", errors.New("empty response") // Gemini may occur this.
}
if err = c.kvStorage.Set(ctx, keyStr, value, 65*time.Minute); err != nil {
// TODO: reduce copies.
if err = c.kvStorage.Set(ctx, []byte(keyStr), []byte(value), 65*time.Minute); err != nil {
log.Error(ctx, err, "set to kv storage")
}
return value, nil
}
var (
toBytes = func(v []float32) ([]byte, error) {
buf := buffer.Get()
defer buffer.Put(buf)
for _, fVal := range v {
if err := binaryutil.WriteFloat32(buf, fVal); err != nil {
return nil, errors.Wrap(err, "write float32")
}
}
// Must copy data, as the buffer will be reused.
bs := make([]byte, buf.Len())
copy(bs, buf.Bytes())
return bs, nil
}
toF32s = func(bs []byte) ([]float32, error) {
if len(bs)%4 != 0 {
return nil, errors.New("embedding data is corrupted, length not multiple of 4")
}
r := bytes.NewReader(bs)
floats := make([]float32, len(bs)/4)
for i := range floats {
f, err := binaryutil.ReadFloat32(r)
if err != nil {
return nil, errors.Wrap(err, "deserialize float32")
}
floats[i] = f
}
return floats, nil
}
)
func (c *cached) Embedding(ctx context.Context, text string) ([]float32, error) {
key := hash.Sum64(text)
keyStr := strconv.FormatUint(key, 10)
valueBs, err := c.kvStorage.Get(ctx, []byte(keyStr))
switch {
case err == nil:
return toF32s(valueBs)
case errors.Is(err, kv.ErrNotFound):
break
default:
return nil, errors.Wrap(err, "get from kv storage")
}
value, err := c.LLM.Embedding(ctx, text)
if err != nil {
return nil, err
}
valueBs, err = toBytes(value)
if err != nil {
return nil, errors.Wrap(err, "serialize embedding")
}
if err = c.kvStorage.Set(ctx, []byte(keyStr), valueBs, 65*time.Minute); err != nil {
log.Error(ctx, err, "set to kv storage")
}
@@ -409,12 +501,27 @@ func (m *mockLLM) String(ctx context.Context, messages []string) (string, error)
func (m *mockLLM) EmbeddingLabels(ctx context.Context, labels model.Labels) ([][]float32, error) {
args := m.Called(ctx, labels)
if args.Error(1) != nil {
return nil, args.Error(1)
}
return args.Get(0).([][]float32), args.Error(1)
}
func (m *mockLLM) Embedding(ctx context.Context, text string) ([]float32, error) {
args := m.Called(ctx, text)
if args.Error(1) != nil {
return nil, args.Error(1)
}
return args.Get(0).([]float32), args.Error(1)
}
func (m *mockLLM) WAV(ctx context.Context, text string, speakers []Speaker) (io.ReadCloser, error) {
args := m.Called(ctx, text, speakers)
if args.Error(1) != nil {
return nil, args.Error(1)
}
return args.Get(0).(io.ReadCloser), args.Error(1)
}

View File

@@ -18,6 +18,7 @@ package llm
import (
"context"
"encoding/json"
"io"
"github.com/pkg/errors"
oai "github.com/sashabaranov/go-openai"
@@ -31,29 +32,43 @@ import (
type openai struct {
*component.Base[Config, struct{}]
client *oai.Client
embeddingSpliter embeddingSpliter
text
}
func newOpenAI(c *Config) LLM {
config := oai.DefaultConfig(c.APIKey)
config.BaseURL = c.Endpoint
client := oai.NewClientWithConfig(config)
embeddingSpliter := newEmbeddingSpliter(2048, 64)
embeddingSpliter := newEmbeddingSpliter(1536, 64)
base := component.New(&component.BaseConfig[Config, struct{}]{
Name: "LLM/openai",
Instance: c.Name,
Config: c,
})
return &openai{
Base: component.New(&component.BaseConfig[Config, struct{}]{
Name: "LLM/openai",
Instance: c.Name,
Config: c,
}),
client: client,
embeddingSpliter: embeddingSpliter,
Base: base,
text: &openaiText{
Base: base,
client: client,
embeddingSpliter: embeddingSpliter,
},
}
}
func (o *openai) String(ctx context.Context, messages []string) (value string, err error) {
func (o *openai) WAV(ctx context.Context, text string, speakers []Speaker) (r io.ReadCloser, err error) {
return nil, errors.New("not supported")
}
type openaiText struct {
*component.Base[Config, struct{}]
client *oai.Client
embeddingSpliter embeddingSpliter
}
func (o *openaiText) String(ctx context.Context, messages []string) (value string, err error) {
ctx = telemetry.StartWith(ctx, append(o.TelemetryLabels(), telemetrymodel.KeyOperation, "String")...)
defer func() { telemetry.End(ctx, err) }()
@@ -61,9 +76,9 @@ func (o *openai) String(ctx context.Context, messages []string) (value string, e
if config.Model == "" {
return "", errors.New("model is not set")
}
msg := make([]oai.ChatCompletionMessage, 0, len(messages))
msgs := make([]oai.ChatCompletionMessage, 0, len(messages))
for _, m := range messages {
msg = append(msg, oai.ChatCompletionMessage{
msgs = append(msgs, oai.ChatCompletionMessage{
Role: oai.ChatMessageRoleUser,
Content: m,
})
@@ -71,7 +86,7 @@ func (o *openai) String(ctx context.Context, messages []string) (value string, e
req := oai.ChatCompletionRequest{
Model: config.Model,
Messages: msg,
Messages: msgs,
Temperature: config.Temperature,
}
@@ -91,7 +106,7 @@ func (o *openai) String(ctx context.Context, messages []string) (value string, e
return resp.Choices[0].Message.Content, nil
}
func (o *openai) EmbeddingLabels(ctx context.Context, labels model.Labels) (value [][]float32, err error) {
func (o *openaiText) EmbeddingLabels(ctx context.Context, labels model.Labels) (value [][]float32, err error) {
ctx = telemetry.StartWith(ctx, append(o.TelemetryLabels(), telemetrymodel.KeyOperation, "EmbeddingLabels")...)
defer func() { telemetry.End(ctx, err) }()
@@ -117,7 +132,7 @@ func (o *openai) EmbeddingLabels(ctx context.Context, labels model.Labels) (valu
return vecs, nil
}
func (o *openai) Embedding(ctx context.Context, s string) (value []float32, err error) {
func (o *openaiText) Embedding(ctx context.Context, s string) (value []float32, err error) {
ctx = telemetry.StartWith(ctx, append(o.TelemetryLabels(), telemetrymodel.KeyOperation, "Embedding")...)
defer func() { telemetry.End(ctx, err) }()
@@ -141,6 +156,6 @@ func (o *openai) Embedding(ctx context.Context, s string) (value []float32, err
promptTokens.WithLabelValues(lvs...).Add(float64(vec.Usage.PromptTokens))
completionTokens.WithLabelValues(lvs...).Add(float64(vec.Usage.CompletionTokens))
totalTokens.WithLabelValues(lvs...).Add(float64(vec.Usage.TotalTokens))
return vec.Data[0].Embedding, nil
}

229
pkg/llm/prompt/prompt.go Normal file
View File

@@ -0,0 +1,229 @@
package prompt
var Builtin = map[string]string{
"category": `
Analyze the content and categorize it into exactly one of these categories:
Technology, Development, Entertainment, Finance, Health, Politics, Other
Classification requirements:
- Choose the SINGLE most appropriate category based on:
* Primary topic and main focus of the content
* Key terminology and concepts used
* Target audience and purpose
* Technical depth and complexity level
- For content that could fit multiple categories:
* Identify the dominant theme
* Consider the most specific applicable category
* Use the primary intended purpose
- If content appears ambiguous:
* Focus on the most prominent aspects
* Consider the practical application
* Choose the category that best serves user needs
Output format:
Return ONLY the category name, no other text or explanation.
Must be one of the provided categories exactly as written.
`,
"tags": `
Analyze the content and add appropriate tags based on:
- Main topics and themes
- Key concepts and terminology
- Target audience and purpose
- Technical depth and domain
- 2-4 tags are enough
Output format:
Return a list of tags, separated by commas, no other text or explanation.
e.g. "AI, Technology, Innovation, Future"
`,
"score": `
Please give a score between 0 and 10 based on the following content.
Evaluate the content comprehensively considering clarity, accuracy, depth, logical structure, language expression, and completeness.
Note: If the content is an article or a text intended to be detailed, the length is an important factor. Generally, content under 300 words may receive a lower score due to lack of substance, unless its type (such as poetry or summary) is inherently suitable for brevity.
Output format:
Return the score (0-10), no other text or explanation.
E.g. "8", "5", "3", etc.
`,
"comment_confucius": `
Please act as Confucius and write a 100-word comment on the article.
Content needs to be in line with the Chinese mainland's regulations.
Output format:
Return the comment only, no other text or explanation.
Reply short and concise, 100 words is enough.
`,
"summary": `
Please read the article carefully and summarize its core content in the format of [Choice: Key Point List / Concise Paragraph]. The summary should clearly cover:
1. What is the main topic/theme of the article?
2. What key arguments/main information did the author put forward?
3. (Optional, if the article contains) What important data, cases, or examples are there?
4. What main conclusions did the article reach or what core information did it ultimately convey?
Strive for comprehensive, accurate, and concise.
`,
"summary_html_snippet": `
You are to act as a professional Content Designer. Your task is to convert the provided article into **visually modern HTML email snippets** that render well in modern email clients like Gmail and QQ Mail.
**Core Requirements:**
* **Highlighting and Layout Techniques (Based on the article content, you must actually use the HTML structure templates provided below to generate the content. It is not necessary to use all of them; choose the ones that best fit the content.):**
*. **Standard Paragraph** (Required) (This is your primary tool. Use it for introductions, conclusions, and to connect different visual elements to build a cohesive narrative.):
<p style="margin:16px 0; font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:16px; line-height:1.75; color:#3c4043;">
Insert your main text, explanations, or transitional sentences here.
</p>
*. **Key Points List** (Required) (for organizing multiple core points):
<ul style="margin:20px 0; padding-left:0; list-style-type:none;">
<li style="position:relative; margin-bottom:12px; padding-left:28px; font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:15px; line-height:1.6; color:#444;">
<span style="position:absolute; left:0; top:0; width:18px; height:18px; background-color:#4285f4; border-radius:50%; color:white; text-align:center; line-height:18px; font-size:12px;">1</span>
Description of the first key point
</li>
<li style="position:relative; margin-bottom:12px; padding-left:28px; font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:15px; line-height:1.6; color:#444;">
<span style="position:absolute; left:0; top:0; width:18px; height:18px; background-color:#4285f4; border-radius:50%; color:white; text-align:center; line-height:18px; font-size:12px;">2</span>
Description of the second key point
</li>
</ul>
*. **Emphasized Text** (Required!!) (for highlighting keywords or phrases):
<span style="background:linear-gradient(180deg, rgba(255,255,255,0) 50%, rgba(66,133,244,0.2) 50%); padding:0 2px;">Text to be emphasized</span>
*. **Stylish Quote Block** (Optional) (for highlighting important points or direct quotes from the original text):
<div style="margin:20px 0; padding:20px; background:linear-gradient(to right, #f8f9fa, #ffffff); border-left:5px solid #4285f4; border-radius:5px; box-shadow:0 2px 8px rgba(0,0,0,0.05);">
<p style="margin:0; font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:16px; line-height:1.6; color:#333; font-weight:500;">
Insert the key point or finding to be highlighted here.
</p>
</div>
*. **Image Block** (Optional) (Embed images from the article where appropriate to aid explanation. Remember to use referrerpolicy="no-referrer" to ensure they display correctly):
<div style="margin:20px 0; text-align:center;">
<img src="URL_of_the_image_from_article" alt="Image description from article" style="max-width:100%; height:auto; border-radius:8px; box-shadow:0 4px 12px rgba(0,0,0,0.1);" referrerpolicy="no-referrer">
</div>
*. **Information Card** (Optional) (for highlighting key data/metrics):
<div style="display:inline-block; margin:10px 10px 10px 0; padding:15px 20px; background-color:#ffffff; border-radius:8px; box-shadow:0 3px 10px rgba(0,0,0,0.08); min-width:120px; text-align:center;">
<p style="margin:0 0 5px 0; font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:14px; color:#666;">Metric Name</p>
<p style="margin:0; font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:24px; font-weight:600; color:#1a73e8;">75%</p>
</div>
*. **Comparison Table** (Optional) (suitable for comparing different solutions or viewpoints based on the article content):
<div style="margin:25px 0; padding:15px; background-color:#f8f9fa; border-radius:8px; overflow-x:auto;">
<table style="width:100%; border-collapse:collapse; font-family:'Google Sans',Roboto,Arial,sans-serif;">
<thead>
<tr>
<th style="padding:12px 15px; text-align:left; border-bottom:2px solid #e0e0e0; color:#202124; font-weight:500;">Feature</th>
<th style="padding:12px 15px; text-align:left; border-bottom:2px solid #e0e0e0; color:#202124; font-weight:500;">Option A</th>
<th style="padding:12px 15px; text-align:left; border-bottom:2px solid #e0e0e0; color:#202124; font-weight:500;">Option B</th>
</tr>
</thead>
<tbody>
<tr>
<td style="padding:12px 15px; border-bottom:1px solid #e0e0e0; color:#444;">Cost</td>
<td style="padding:12px 15px; border-bottom:1px solid #e0e0e0; color:#444;">Higher</td>
<td style="padding:12px 15px; border-bottom:1px solid #e0e0e0; color:#444;">Moderate</td>
</tr>
<tr>
<td style="padding:12px 15px; border-bottom:1px solid #e0e0e0; color:#444;">Efficiency</td>
<td style="padding:12px 15px; border-bottom:1px solid #e0e0e0; color:#444;">Very High</td>
<td style="padding:12px 15px; border-bottom:1px solid #e0e0e0; color:#444;">Average</td>
</tr>
</tbody>
</table>
* **Output Requirements:**
* The design should be **aesthetically pleasing and elegant, with harmonious color schemes**, ensuring sufficient **whitespace and contrast**.
* All article snippets must maintain a **consistent visual style**.
* You **must use multiple visual elements** and avoid mere text listings. **Use at least 2-3 different visual elements** to enhance readability and intuitive understanding.
* **Weave these components together with plain text.** They are not meant to be isolated blocks. Use transitional text to connect them, ensuring a smooth and logical reading experience.
* **Appropriately quote important original text snippets** to support explanations.
* **Strive to use highlighting styles to mark key points**.
* **Ensure overall reading flow is smooth and natural!!!** Guide the reader's thought process appropriately, minimizing abrupt jumps in logic.
* **Output only the HTML code snippet.** Do not include the full HTML document structure (i.e., no <html>, <head>, or <body> tags).
* **Do not add any explanatory text, extra comments, Markdown formatting, or HTML backticks.** Output the raw HTML code directly.
* **Do not add article titles or sources;** these will be automatically injected by the user later.
* **Do not use any opening remarks or pleasantries** (e.g., "Hi," "Let's talk about..."). Directly present the processed HTML content.
* **Do not refer to "this article," "this piece," "the current text," etc.** The user is aware of this context.
* **Only use inline styles, do not use global styles.** Remember to only generate HTML snippets.
* Do not explain anything, just output the HTML code snippet.
* Use above HTML components & its styles to generate the HTML code snippet, do not customize by yourself, else you will be fired.
* **Your Personality and Expression Preferences:**
* ** Have a strong aversion to jargon, bureaucratic language, redundant embellishments, and grand narratives. Believe that plain, simple language can best convey truth.
* Be fluent, plain, concise, and not verbose.
* Be **plain, direct, clear, and easy to understand:** Use basic vocabulary and simple sentence structures. Avoid "sophisticated" complex sentences or unnecessary embellishments that increase reading burden.
* Enable readers to quickly grasp: "What is this? What is it generally about? What is its relevance/real-world significance to me (an ordinary person)?" Focus on providing an **overview**, not an accumulation of details.
* Be well-versed in cognitive science; understand how to phrase information so that someone without prior background can quickly understand the core content.
* **Extract key information and core insights,** rather than directly copying the original text. Do not omit crucial information and viewpoints. For example, for forum posts, the main points from comments are also very important!
* Avoid large blocks of text, strive for a combination of pictures and text.
`,
"summary_html_snippet_for_small_model": `
You are to act as a professional Content Designer. Your task is to convert the provided article into **visually modern HTML email snippets** that render well in modern email clients like Gmail and QQ Mail.
**Core Requirements:**
* **Highlighting and Layout Techniques (Based on the article content, you must actually use the HTML structure templates provided below to generate the content. It is not necessary to use all of them; choose the ones that best fit the content.):**
*. **Standard Paragraph** (Required) (This is your primary tool. Use it for introductions, conclusions, and to connect different visual elements to build a cohesive narrative.):
<p style="margin:16px 0; font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:16px; line-height:1.75; color:#3c4043;">
Insert your main text, explanations, or transitional sentences here.
</p>
*. **Key Points List** (Required) (for organizing multiple core points):
<ul style="margin:20px 0; padding-left:0; list-style-type:none;">
<li style="position:relative; margin-bottom:12px; padding-left:28px; font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:15px; line-height:1.6; color:#444;">
<span style="position:absolute; left:0; top:0; width:18px; height:18px; background-color:#4285f4; border-radius:50%; color:white; text-align:center; line-height:18px; font-size:12px;">1</span>
Description of the first key point
</li>
<li style="position:relative; margin-bottom:12px; padding-left:28px; font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:15px; line-height:1.6; color:#444;">
<span style="position:absolute; left:0; top:0; width:18px; height:18px; background-color:#4285f4; border-radius:50%; color:white; text-align:center; line-height:18px; font-size:12px;">2</span>
Description of the second key point
</li>
</ul>
*. **Emphasized Text** (Required!!) (for highlighting keywords or phrases):
<span style="background:linear-gradient(180deg, rgba(255,255,255,0) 50%, rgba(66,133,244,0.2) 50%); padding:0 2px;">Text to be emphasized</span>
*. **Stylish Quote Block** (Optional) (for highlighting important points or direct quotes from the original text):
<div style="margin:20px 0; padding:20px; background:linear-gradient(to right, #f8f9fa, #ffffff); border-left:5px solid #4285f4; border-radius:5px; box-shadow:0 2px 8px rgba(0,0,0,0.05);">
<p style="margin:0; font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:16px; line-height:1.6; color:#333; font-weight:500;">
Insert the key point or finding to be highlighted here.
</p>
</div>
*. **Image Block** (Optional) (Embed images from the article where appropriate to aid explanation. Remember to use referrerpolicy="no-referrer" to ensure they display correctly):
<div style="margin:20px 0; text-align:center;">
<img src="URL_of_the_image_from_article" alt="Image description from article" style="max-width:100%; height:auto; border-radius:8px; box-shadow:0 4px 12px rgba(0,0,0,0.1);" referrerpolicy="no-referrer">
</div>
* **Output Requirements:**
* The design should be **aesthetically pleasing and elegant, with harmonious color schemes**, ensuring sufficient **whitespace and contrast**.
* All article snippets must maintain a **consistent visual style**.
* You **must use multiple visual elements** and avoid mere text listings. **Use at least 2-3 different visual elements** to enhance readability and intuitive understanding.
* **Weave these components together with plain text.** They are not meant to be isolated blocks. Use transitional text to connect them, ensuring a smooth and logical reading experience.
* **Appropriately quote important original text snippets** to support explanations.
* **Strive to use highlighting styles to mark key points**.
* **Ensure overall reading flow is smooth and natural!!!** Guide the reader's thought process appropriately, minimizing abrupt jumps in logic.
* **Output only the HTML code snippet.** Do not include the full HTML document structure (i.e., no <html>, <head>, or <body> tags).
* **Do not add any explanatory text, extra comments, Markdown formatting, or HTML backticks.** Output the raw HTML code directly.
* **Do not add article titles or sources;** these will be automatically injected by the user later.
* **Do not use any opening remarks or pleasantries** (e.g., "Hi," "Let's talk about..."). Directly present the processed HTML content.
* **Do not refer to "this article," "this piece," "the current text," etc.** The user is aware of this context.
* **Only use inline styles, do not use global styles.** Remember to only generate HTML snippets.
* Do not explain anything, just output the HTML code snippet.
* Use above HTML components & its styles to generate the HTML code snippet, do not customize by yourself, else you will be fired.
* **Your Personality and Expression Preferences:**
* ** Have a strong aversion to jargon, bureaucratic language, redundant embellishments, and grand narratives. Believe that plain, simple language can best convey truth.
* Be fluent, plain, concise, and not verbose.
* Be **plain, direct, clear, and easy to understand:** Use basic vocabulary and simple sentence structures. Avoid "sophisticated" complex sentences or unnecessary embellishments that increase reading burden.
* Enable readers to quickly grasp: "What is this? What is it generally about? What is its relevance/real-world significance to me (an ordinary person)?" Focus on providing an **overview**, not an accumulation of details.
* Be well-versed in cognitive science; understand how to phrase information so that someone without prior background can quickly understand the core content.
* **Extract key information and core insights,** rather than directly copying the original text. Do not omit crucial information and viewpoints. For example, for forum posts, the main points from comments are also very important!
* Avoid large blocks of text, strive for a combination of pictures and text.
`,
}

View File

@@ -30,6 +30,7 @@ import (
const (
AppName = "zenfeed"
Module = "github.com/glidea/zenfeed"
)
// LabelXXX is the metadata label for the feed.
@@ -82,6 +83,7 @@ func (ls *Labels) FromMap(m map[string]string) {
for k, v := range m {
*ls = append(*ls, Label{Key: k, Value: v})
}
ls.EnsureSorted()
}
func (ls Labels) Map() map[string]string {
@@ -233,6 +235,76 @@ type Label struct {
Value string `json:"value"`
}
// Label filter operators understood by NewLabelFilter.
const (
	LabelFilterEqual    = "="  // label value must equal the filter value
	LabelFilterNotEqual = "!=" // label value must differ from the filter value
)
// LabelFilter matches a single label against a fixed value, either for
// equality (Equal == true) or inequality (Equal == false).
type LabelFilter struct {
	Label string // label key to look up
	Equal bool   // true for "=", false for "!="
	Value string // value to compare against
}
// NewLabelFilter parses a filter expression of the form "label=value" or
// "label!=value" into a LabelFilter. It splits on the first occurrence of the
// operator, so the value itself may contain "=" (e.g. "url=https://a?b=c"),
// which the previous Split-based parsing rejected. An expression containing
// neither operator yields an error.
func NewLabelFilter(filter string) (LabelFilter, error) {
	// Check "!=" first: every "!=" expression also contains "=".
	if label, value, ok := strings.Cut(filter, LabelFilterNotEqual); ok {
		return LabelFilter{Label: label, Value: value, Equal: false}, nil
	}
	if label, value, ok := strings.Cut(filter, LabelFilterEqual); ok {
		return LabelFilter{Label: label, Value: value, Equal: true}, nil
	}

	return LabelFilter{}, errors.New("invalid label filter")
}
// Match reports whether the filter accepts the given label set.
// A label that is absent (resolves to the empty string) never matches,
// regardless of the operator.
func (f LabelFilter) Match(labels Labels) bool {
	got := labels.Get(f.Label)
	if got == "" {
		return false
	}

	// Equality filters want got == Value; inequality filters want the opposite.
	return (got == f.Value) == f.Equal
}
// LabelFilters is a conjunction of label filters: a label set matches only if
// every filter in the list accepts it.
type LabelFilters []LabelFilter

// Match reports whether all filters accept the given labels.
// An empty filter list matches everything (vacuous truth).
func (ls LabelFilters) Match(labels Labels) bool {
	for _, f := range ls {
		if !f.Match(labels) {
			return false
		}
	}

	return true
}
// NewLabelFilters parses each expression with NewLabelFilter and returns the
// resulting filter set. It fails fast on the first invalid expression.
func NewLabelFilters(filters []string) (LabelFilters, error) {
	out := make(LabelFilters, 0, len(filters))
	for _, raw := range filters {
		parsed, err := NewLabelFilter(raw)
		if err != nil {
			return nil, errors.Wrapf(err, "new label filter %q", raw)
		}
		out = append(out, parsed)
	}

	return out, nil
}
// readExpectedDelim reads the next token and checks if it's the expected delimiter.
func readExpectedDelim(dec *json.Decoder, expected json.Delim) error {
t, err := dec.Token()

View File

@@ -124,10 +124,10 @@ func (c *aggrChannel) Send(ctx context.Context, receiver Receiver, group *route.
if receiver.Email != "" && c.email != nil {
return c.send(ctx, receiver, group, c.email, "email")
}
// if receiver.Webhook != nil && c.webhook != nil {
// TODO: temporarily disable webhook to reduce copyright risks.
// return c.send(ctx, receiver, group, c.webhook, "webhook")
// }
if receiver.Webhook != nil && c.webhook != nil {
return c.send(ctx, receiver, group, c.webhook, "webhook")
}
return nil
}

View File

@@ -130,47 +130,57 @@ func (e *email) buildEmail(receiver Receiver, group *route.FeedGroup) (*gomail.M
m.SetHeader("To", receiver.Email)
m.SetHeader("Subject", group.Name)
body, err := e.buildBodyHTML(group.Feeds)
body, err := e.buildBodyHTML(group)
if err != nil {
return nil, errors.Wrap(err, "build email body HTML")
}
m.SetBody("text/html", string(body))
m.SetBody("text/html", body)
return m, nil
}
func (e *email) buildBodyHTML(feeds []*route.Feed) ([]byte, error) {
func (e *email) buildBodyHTML(group *route.FeedGroup) (string, error) {
bodyBuf := buffer.Get()
defer buffer.Put(bodyBuf)
// Write HTML header.
if err := e.writeHTMLHeader(bodyBuf); err != nil {
return nil, errors.Wrap(err, "write HTML header")
return "", errors.Wrap(err, "write HTML header")
}
// Write summary.
if err := e.writeSummary(bodyBuf, group.Summary); err != nil {
return "", errors.Wrap(err, "write summary")
}
// Write each feed content.
for i, feed := range feeds {
if _, err := bodyBuf.WriteString(`
<div style="margin-top:20px; padding-top:15px; border-top:1px solid #f1f3f4;">
<p style="font-size:32px; font-weight:500; margin:0 0 10px 0;">Feeds</p>`); err != nil {
return "", errors.Wrap(err, "write feeds header")
}
for i, feed := range group.Feeds {
if err := e.writeFeedContent(bodyBuf, feed); err != nil {
return nil, errors.Wrap(err, "write feed content")
return "", errors.Wrap(err, "write feed content")
}
// Add separator (except the last feed).
if i < len(feeds)-1 {
if i < len(group.Feeds)-1 {
if err := e.writeSeparator(bodyBuf); err != nil {
return nil, errors.Wrap(err, "write separator")
return "", errors.Wrap(err, "write separator")
}
}
}
// Write disclaimer and HTML footer.
if err := e.writeDisclaimer(bodyBuf); err != nil {
return nil, errors.Wrap(err, "write disclaimer")
return "", errors.Wrap(err, "write disclaimer")
}
if err := e.writeHTMLFooter(bodyBuf); err != nil {
return nil, errors.Wrap(err, "write HTML footer")
return "", errors.Wrap(err, "write HTML footer")
}
return bodyBuf.Bytes(), nil
return bodyBuf.String(), nil
}
func (e *email) writeHTMLHeader(buf *buffer.Bytes) error {
@@ -188,6 +198,29 @@ func (e *email) writeHTMLHeader(buf *buffer.Bytes) error {
return err
}
// writeSummary renders the group summary section into the email body buffer.
// It is a no-op when summary is empty. The summary is treated as Markdown and
// converted to HTML, then wrapped in an inline-styled div (email clients
// require inline styles).
func (e *email) writeSummary(buf *buffer.Bytes, summary string) error {
	if summary == "" {
		return nil
	}
	// Section heading, styled to match the "Feeds" heading elsewhere in the body.
	if _, err := buf.WriteString(`
<p style="font-size:32px; font-weight:500; margin:0 0 10px 0;">Summary</p>`); err != nil {
		return errors.Wrap(err, "write summary header")
	}
	contentHTML, err := textconvert.MarkdownToHTML([]byte(summary))
	if err != nil {
		return errors.Wrap(err, "markdown to HTML")
	}
	contentHTMLWithStyle := fmt.Sprintf(`<div style="font-size:16px; line-height:1.8;">%s</div>`, contentHTML)
	if _, err := buf.WriteString(contentHTMLWithStyle); err != nil {
		return errors.Wrap(err, "write summary")
	}
	return nil
}
const timeLayout = "01-02 15:04"
func (e *email) writeFeedContent(buf *buffer.Bytes, feed *route.Feed) error {
@@ -311,7 +344,8 @@ func (e *email) renderMarkdownContent(buf *buffer.Bytes, feed *route.Feed) (n in
return 0, errors.Wrap(err, "markdown to HTML")
}
if _, err := buf.Write(contentHTML); err != nil {
contentHTMLWithStyle := fmt.Sprintf(`<div style="font-size:16px; line-height:1.8;">%s</div>`, contentHTML)
if _, err := buf.WriteString(contentHTMLWithStyle); err != nil {
return 0, errors.Wrap(err, "write content HTML")
}

View File

@@ -41,9 +41,10 @@ func (r *WebhookReceiver) Validate() error {
}
type webhookBody struct {
Group string `json:"group"`
Labels model.Labels `json:"labels"`
Feeds []*route.Feed `json:"feeds"`
Group string `json:"group"`
Labels model.Labels `json:"labels"`
Summary string `json:"summary"`
Feeds []*route.Feed `json:"feeds"`
}
func newWebhook() sender {
@@ -59,9 +60,10 @@ type webhook struct {
func (w *webhook) Send(ctx context.Context, receiver Receiver, group *route.FeedGroup) error {
// Prepare request.
body := &webhookBody{
Group: group.Name,
Labels: group.Labels,
Feeds: group.Feeds,
Group: group.Name,
Labels: group.Labels,
Summary: group.Summary,
Feeds: group.Feeds,
}
b := runtimeutil.Must1(json.Marshal(body))
req, err := http.NewRequestWithContext(ctx, http.MethodPost, receiver.Webhook.URL, bytes.NewReader(b))

View File

@@ -27,6 +27,7 @@ import (
"github.com/glidea/zenfeed/pkg/component"
"github.com/glidea/zenfeed/pkg/config"
"github.com/glidea/zenfeed/pkg/llm"
"github.com/glidea/zenfeed/pkg/notify/channel"
"github.com/glidea/zenfeed/pkg/notify/route"
"github.com/glidea/zenfeed/pkg/schedule/rule"
@@ -67,6 +68,9 @@ func (c *Config) From(app *config.App) *Config {
c.Route = route.Config{
Route: route.Route{
GroupBy: app.Notify.Route.GroupBy,
SourceLabel: app.Notify.Route.SourceLabel,
SummaryPrompt: app.Notify.Route.SummaryPrompt,
LLM: app.Notify.Route.LLM,
CompressByRelatedThreshold: app.Notify.Route.CompressByRelatedThreshold,
Receivers: app.Notify.Route.Receivers,
},
@@ -82,9 +86,9 @@ func (c *Config) From(app *config.App) *Config {
if app.Notify.Receivers[i].Email != "" {
c.Receivers[i].Email = app.Notify.Receivers[i].Email
}
// if app.Notify.Receivers[i].Webhook != nil {
// c.Receivers[i].Webhook = &channel.WebhookReceiver{URL: app.Notify.Receivers[i].Webhook.URL}
// }
if app.Notify.Receivers[i].Webhook != nil {
c.Receivers[i].Webhook = &channel.WebhookReceiver{URL: app.Notify.Receivers[i].Webhook.URL}
}
}
c.Channels = channel.Config{}
@@ -105,6 +109,9 @@ func convertSubRoute(from *config.NotifySubRoute) *route.SubRoute {
to := &route.SubRoute{
Route: route.Route{
GroupBy: from.GroupBy,
SourceLabel: from.SourceLabel,
SummaryPrompt: from.SummaryPrompt,
LLM: from.LLM,
CompressByRelatedThreshold: from.CompressByRelatedThreshold,
Receivers: from.Receivers,
},
@@ -169,6 +176,7 @@ type Dependencies struct {
RouterFactory route.Factory
ChannelFactory channel.Factory
KVStorage kv.Storage
LLMFactory llm.Factory
}
// --- Factory code block ---
@@ -322,7 +330,10 @@ func (n *notifier) newRouter(config *route.Config) (route.Router, error) {
return n.Dependencies().RouterFactory.New(
n.Instance(),
config,
route.Dependencies{RelatedScore: n.Dependencies().RelatedScore},
route.Dependencies{
RelatedScore: n.Dependencies().RelatedScore,
LLMFactory: n.Dependencies().LLMFactory,
},
)
}
@@ -339,7 +350,7 @@ func (n *notifier) handle(ctx context.Context, result *rule.Result) {
router := n.router
n.mu.RUnlock()
groups, err := router.Route(result)
groups, err := router.Route(ctx, result)
if err != nil {
// We don't retry in notifier, retry should be upstream.
log.Error(ctx, errors.Wrap(err, "route"))
@@ -427,8 +438,8 @@ func (n *notifier) send(ctx context.Context, work sendWork) error {
return channel.Send(ctx, work.receiver.Receiver, work.group)
}
var nlogKey = func(group *route.FeedGroup, receiver Receiver) string {
return fmt.Sprintf("notifier.group.%s.receiver.%s", group.Name, receiver.Name)
var nlogKey = func(group *route.FeedGroup, receiver Receiver) []byte {
return fmt.Appendf(nil, "notifier.group.%s.receiver.%s.%d", group.Name, receiver.Name, group.Time.Unix())
}
func (n *notifier) isSent(ctx context.Context, group *route.FeedGroup, receiver Receiver) bool {
@@ -446,7 +457,7 @@ func (n *notifier) isSent(ctx context.Context, group *route.FeedGroup, receiver
}
func (n *notifier) markSent(ctx context.Context, group *route.FeedGroup, receiver Receiver) error {
return n.Dependencies().KVStorage.Set(ctx, nlogKey(group, receiver), timeutil.Format(time.Now()), timeutil.Day)
return n.Dependencies().KVStorage.Set(ctx, nlogKey(group, receiver), []byte(timeutil.Format(time.Now())), timeutil.Day)
}
type sendWork struct {

View File

@@ -16,6 +16,8 @@
package route
import (
"context"
"encoding/json"
"fmt"
"sort"
"strings"
@@ -25,16 +27,20 @@ import (
"k8s.io/utils/ptr"
"github.com/glidea/zenfeed/pkg/component"
"github.com/glidea/zenfeed/pkg/llm"
"github.com/glidea/zenfeed/pkg/model"
"github.com/glidea/zenfeed/pkg/schedule/rule"
"github.com/glidea/zenfeed/pkg/storage/feed/block"
"github.com/glidea/zenfeed/pkg/telemetry"
telemetrymodel "github.com/glidea/zenfeed/pkg/telemetry/model"
runtimeutil "github.com/glidea/zenfeed/pkg/util/runtime"
timeutil "github.com/glidea/zenfeed/pkg/util/time"
)
// --- Interface code block ---
type Router interface {
component.Component
Route(result *rule.Result) (groups []*Group, err error)
Route(ctx context.Context, result *rule.Result) (groups []*Group, err error)
}
type Config struct {
@@ -43,6 +49,9 @@ type Config struct {
type Route struct {
GroupBy []string
SourceLabel string
SummaryPrompt string
LLM string
CompressByRelatedThreshold *float32
Receivers []string
SubRoutes SubRoutes
@@ -63,56 +72,25 @@ func (s SubRoutes) Match(feed *block.FeedVO) *SubRoute {
type SubRoute struct {
Route
Matchers []string
matchers []matcher
matchers model.LabelFilters
}
func (r *SubRoute) Match(feed *block.FeedVO) *SubRoute {
// Match sub routes.
for _, subRoute := range r.SubRoutes {
if matched := subRoute.Match(feed); matched != nil {
return matched
}
}
for _, m := range r.matchers {
fv := feed.Labels.Get(m.key)
switch m.equal {
case true:
if fv != m.value {
return nil
}
default:
if fv == m.value {
return nil
}
}
// Match self.
if !r.matchers.Match(feed.Labels) {
return nil
}
return r
}
type matcher struct {
key string
value string
equal bool
}
var (
matcherEqual = "="
matcherNotEqual = "!="
parseMatcher = func(filter string) (matcher, error) {
eq := false
parts := strings.Split(filter, matcherNotEqual)
if len(parts) != 2 {
parts = strings.Split(filter, matcherEqual)
eq = true
}
if len(parts) != 2 {
return matcher{}, errors.New("invalid matcher")
}
return matcher{key: parts[0], value: parts[1], equal: eq}, nil
}
)
func (r *SubRoute) Validate() error {
if len(r.GroupBy) == 0 {
r.GroupBy = []string{model.LabelSource}
@@ -120,17 +98,16 @@ func (r *SubRoute) Validate() error {
if r.CompressByRelatedThreshold == nil {
r.CompressByRelatedThreshold = ptr.To(float32(0.85))
}
if len(r.Matchers) == 0 {
return errors.New("matchers is required")
}
r.matchers = make([]matcher, len(r.Matchers))
for i, matcher := range r.Matchers {
m, err := parseMatcher(matcher)
if err != nil {
return errors.Wrap(err, "invalid matcher")
}
r.matchers[i] = m
matchers, err := model.NewLabelFilters(r.Matchers)
if err != nil {
return errors.Wrap(err, "invalid matchers")
}
r.matchers = matchers
for _, subRoute := range r.SubRoutes {
if err := subRoute.Validate(); err != nil {
return errors.Wrap(err, "invalid sub_route")
@@ -142,7 +119,7 @@ func (r *SubRoute) Validate() error {
func (c *Config) Validate() error {
if len(c.GroupBy) == 0 {
c.GroupBy = []string{model.LabelSource}
c.GroupBy = []string{model.LabelType}
}
if c.CompressByRelatedThreshold == nil {
c.CompressByRelatedThreshold = ptr.To(float32(0.85))
@@ -158,6 +135,7 @@ func (c *Config) Validate() error {
type Dependencies struct {
RelatedScore func(a, b [][]float32) (float32, error) // MUST same with vector index.
LLMFactory llm.Factory
}
type Group struct {
@@ -166,10 +144,11 @@ type Group struct {
}
type FeedGroup struct {
Name string
Time time.Time
Labels model.Labels
Feeds []*Feed
Name string
Time time.Time
Labels model.Labels
Summary string
Feeds []*Feed
}
func (g *FeedGroup) ID() string {
@@ -216,7 +195,10 @@ type router struct {
*component.Base[Config, Dependencies]
}
func (r *router) Route(result *rule.Result) (groups []*Group, err error) {
func (r *router) Route(ctx context.Context, result *rule.Result) (groups []*Group, err error) {
ctx = telemetry.StartWith(ctx, append(r.TelemetryLabels(), telemetrymodel.KeyOperation, "Route")...)
defer func() { telemetry.End(ctx, err) }()
// Find route for each feed.
feedsByRoute := r.routeFeeds(result.Feeds)
@@ -233,12 +215,21 @@ func (r *router) Route(result *rule.Result) (groups []*Group, err error) {
// Build final groups.
for ls, feeds := range relatedGroups {
var summary string
if prompt := route.SummaryPrompt; prompt != "" && len(feeds) > 1 {
// TODO: Avoid potential for duplicate generation.
summary, err = r.generateSummary(ctx, prompt, feeds, route.SourceLabel)
if err != nil {
return nil, errors.Wrap(err, "generate summary")
}
}
groups = append(groups, &Group{
FeedGroup: FeedGroup{
Name: fmt.Sprintf("%s %s", result.Rule, ls.String()),
Time: result.Time,
Labels: *ls,
Feeds: feeds,
Name: fmt.Sprintf("%s %s", result.Rule, ls.String()),
Time: result.Time,
Labels: *ls,
Feeds: feeds,
Summary: summary,
},
Receivers: route.Receivers,
})
@@ -252,6 +243,44 @@ func (r *router) Route(result *rule.Result) (groups []*Group, err error) {
return groups, nil
}
// generateSummary asks the route's configured LLM to summarize the given
// feeds using the provided prompt. It returns "" with a nil error when there
// is no content to summarize. sourceLabel selects which label value supplies
// the summarization input; when empty, the feeds are JSON-serialized instead
// (see parseContentToSummary).
func (r *router) generateSummary(
	ctx context.Context,
	prompt string,
	feeds []*Feed,
	sourceLabel string,
) (string, error) {
	content := r.parseContentToSummary(feeds, sourceLabel)
	if content == "" {
		return "", nil
	}

	// Content and instruction prompt are sent as two separate messages.
	llm := r.Dependencies().LLMFactory.Get(r.Config().LLM)
	summary, err := llm.String(ctx, []string{
		content,
		prompt,
	})
	if err != nil {
		return "", errors.Wrap(err, "llm string")
	}

	return summary, nil
}
// parseContentToSummary renders the feeds into a single text blob for the LLM.
// When a sourceLabel is configured, the values of that label are concatenated
// in feed order; otherwise the whole feed list is serialized as JSON.
func (r *router) parseContentToSummary(feeds []*Feed, sourceLabel string) string {
	if sourceLabel != "" {
		var builder strings.Builder
		for _, f := range feeds {
			builder.WriteString(f.Labels.Get(sourceLabel))
		}

		return builder.String()
	}

	raw := runtimeutil.Must1(json.Marshal(feeds))

	return string(raw)
}
func (r *router) routeFeeds(feeds []*block.FeedVO) map[*Route][]*block.FeedVO {
config := r.Config()
feedsByRoute := make(map[*Route][]*block.FeedVO)
@@ -290,6 +319,12 @@ func (r *router) groupFeedsByLabels(route *Route, feeds []*block.FeedVO) map[*mo
groupedFeeds[labelGroup] = append(groupedFeeds[labelGroup], feed)
}
for _, feeds := range groupedFeeds {
sort.Slice(feeds, func(i, j int) bool {
return feeds[i].ID < feeds[j].ID
})
}
return groupedFeeds
}
@@ -344,6 +379,16 @@ func (r *router) compressRelatedFeedsForGroup(
}
}
// Sort.
sort.Slice(feedsWithRelated, func(i, j int) bool {
return feedsWithRelated[i].ID < feedsWithRelated[j].ID
})
for _, feed := range feedsWithRelated {
sort.Slice(feed.Related, func(i, j int) bool {
return feed.Related[i].ID < feed.Related[j].ID
})
}
return feedsWithRelated, nil
}
@@ -351,8 +396,8 @@ type mockRouter struct {
component.Mock
}
func (m *mockRouter) Route(result *rule.Result) (groups []*Group, err error) {
m.Called(result)
func (m *mockRouter) Route(ctx context.Context, result *rule.Result) (groups []*Group, err error) {
m.Called(ctx, result)
return groups, err
}

View File

@@ -1,6 +1,7 @@
package route
import (
"context"
"fmt"
"testing"
"time"
@@ -11,6 +12,7 @@ import (
"k8s.io/utils/ptr"
"github.com/glidea/zenfeed/pkg/component"
"github.com/glidea/zenfeed/pkg/llm"
"github.com/glidea/zenfeed/pkg/model"
"github.com/glidea/zenfeed/pkg/schedule/rule"
"github.com/glidea/zenfeed/pkg/storage/feed/block"
@@ -382,6 +384,11 @@ func TestRoute(t *testing.T) {
tt.GivenDetail.relatedScore(&mockDep.Mock)
}
llmFactory, err := llm.NewFactory("", nil, llm.FactoryDependencies{}, component.MockOption(func(m *mock.Mock) {
m.On("String", mock.Anything, mock.Anything).Return("test", nil)
}))
Expect(err).NotTo(HaveOccurred())
routerInstance := &router{
Base: component.New(&component.BaseConfig[Config, Dependencies]{
Name: "TestRouter",
@@ -389,11 +396,12 @@ func TestRoute(t *testing.T) {
Config: tt.GivenDetail.config,
Dependencies: Dependencies{
RelatedScore: mockDep.RelatedScore,
LLMFactory: llmFactory,
},
}),
}
groups, err := routerInstance.Route(tt.WhenDetail.ruleResult)
groups, err := routerInstance.Route(context.Background(), tt.WhenDetail.ruleResult)
if tt.ThenExpected.isErr {
Expect(err).To(HaveOccurred())
@@ -529,7 +537,7 @@ func TestConfig_Validate(t *testing.T) {
},
},
wantErr: true,
errMsg: "invalid sub_route: invalid matcher: invalid matcher",
errMsg: "invalid sub_route: invalid matchers: new label filter",
},
{
name: "Valid nested sub-route",

View File

@@ -17,10 +17,14 @@ package rewrite
import (
"context"
"fmt"
"html/template"
"io"
"regexp"
"strconv"
"strings"
"time"
"unicode/utf8"
"unsafe"
"github.com/pkg/errors"
"k8s.io/utils/ptr"
@@ -28,14 +32,17 @@ import (
"github.com/glidea/zenfeed/pkg/component"
"github.com/glidea/zenfeed/pkg/config"
"github.com/glidea/zenfeed/pkg/llm"
"github.com/glidea/zenfeed/pkg/llm/prompt"
"github.com/glidea/zenfeed/pkg/model"
"github.com/glidea/zenfeed/pkg/storage/object"
"github.com/glidea/zenfeed/pkg/telemetry"
telemetrymodel "github.com/glidea/zenfeed/pkg/telemetry/model"
"github.com/glidea/zenfeed/pkg/util/buffer"
"github.com/glidea/zenfeed/pkg/util/crawl"
hashutil "github.com/glidea/zenfeed/pkg/util/hash"
)
// --- Interface code block ---
type Rewriter interface {
component.Component
config.Watcher
@@ -67,10 +74,16 @@ func (c *Config) From(app *config.App) {
}
type Dependencies struct {
LLMFactory llm.Factory
LLMFactory llm.Factory // NOTE: String() with cache.
ObjectStorage object.Storage
}
type Rule struct {
// If is the condition to check before applying the rule.
// If not set, the rule will be applied.
If []string
if_ model.LabelFilters
// SourceLabel specifies which label's value to use as source text.
// Default is model.LabelContent.
SourceLabel string
@@ -95,30 +108,118 @@ type Rule struct {
Label string
}
func (r *Rule) Validate() error { //nolint:cyclop
func (r *Rule) Validate() error { //nolint:cyclop,gocognit,funlen
// If.
if len(r.If) > 0 {
if_, err := model.NewLabelFilters(r.If)
if err != nil {
return errors.Wrapf(err, "invalid if %q", r.If)
}
r.if_ = if_
}
// Source label.
if r.SourceLabel == "" {
r.SourceLabel = model.LabelContent
}
if r.SkipTooShortThreshold == nil {
r.SkipTooShortThreshold = ptr.To(300)
r.SkipTooShortThreshold = ptr.To(0)
}
// Transform.
if r.Transform != nil {
if r.Transform.ToText.Prompt == "" {
return errors.New("to text prompt is required")
if r.Transform != nil { //nolint:nestif
if r.Transform.ToText != nil && r.Transform.ToPodcast != nil {
return errors.New("to_text and to_podcast can not be set at same time")
}
tmpl, err := template.New("").Parse(r.Transform.ToText.Prompt)
if err != nil {
return errors.Wrapf(err, "parse prompt template %s", r.Transform.ToText.Prompt)
if r.Transform.ToText == nil && r.Transform.ToPodcast == nil {
return errors.New("either to_text or to_podcast must be set when transform is set")
}
buf := buffer.Get()
defer buffer.Put(buf)
if err := tmpl.Execute(buf, promptTemplates); err != nil {
return errors.Wrapf(err, "execute prompt template %s", r.Transform.ToText.Prompt)
if r.Transform.ToText != nil {
switch r.Transform.ToText.Type {
case ToTextTypePrompt:
if r.Transform.ToText.Prompt == "" {
return errors.New("to text prompt is required for prompt type")
}
tmpl, err := template.New("").Parse(r.Transform.ToText.Prompt)
if err != nil {
return errors.Wrapf(err, "parse prompt template %s", r.Transform.ToText.Prompt)
}
buf := buffer.Get()
defer buffer.Put(buf)
if err := tmpl.Execute(buf, prompt.Builtin); err != nil {
return errors.Wrapf(err, "execute prompt template %s", r.Transform.ToText.Prompt)
}
r.Transform.ToText.promptRendered = buf.String()
case ToTextTypeCrawl, ToTextTypeCrawlByJina:
// No specific validation for crawl type here, as the source text itself is the URL.
default:
return errors.Errorf("unknown transform type: %s", r.Transform.ToText.Type)
}
}
if r.Transform.ToPodcast != nil {
if len(r.Transform.ToPodcast.Speakers) == 0 {
return errors.New("at least one speaker is required for to_podcast")
}
r.Transform.ToPodcast.speakers = make([]llm.Speaker, len(r.Transform.ToPodcast.Speakers))
var speakerDescs []string
var speakerNames []string
for i, s := range r.Transform.ToPodcast.Speakers {
if s.Name == "" {
return errors.New("speaker name is required")
}
if s.Voice == "" {
return errors.New("speaker voice is required")
}
r.Transform.ToPodcast.speakers[i] = llm.Speaker{Name: s.Name, Voice: s.Voice}
desc := s.Name
if s.Role != "" {
desc += " (" + s.Role + ")"
}
speakerDescs = append(speakerDescs, desc)
speakerNames = append(speakerNames, s.Name)
}
speakersDesc := "- " + strings.Join(speakerDescs, "\n- ")
exampleSpeaker1 := speakerNames[0]
exampleSpeaker2 := exampleSpeaker1
if len(speakerNames) > 1 {
exampleSpeaker2 = speakerNames[1]
}
promptSegments := []string{
"Please convert the following article into a podcast dialogue script.",
"The speakers are:\n" + speakersDesc,
}
if r.Transform.ToPodcast.EstimateMaximumDuration > 0 {
wordsPerMinute := 200
totalMinutes := int(r.Transform.ToPodcast.EstimateMaximumDuration.Minutes())
estimatedWords := totalMinutes * wordsPerMinute
promptSegments = append(promptSegments, fmt.Sprintf("The script should be approximately %d words to fit within a %d-minute duration. If the original content is not sufficient, the script can be shorter as appropriate.", estimatedWords, totalMinutes))
}
if r.Transform.ToPodcast.TranscriptAdditionalPrompt != "" {
promptSegments = append(promptSegments, "Additional instructions: "+r.Transform.ToPodcast.TranscriptAdditionalPrompt)
}
promptSegments = append(promptSegments,
"The output format MUST be a script where each line starts with the speaker's name followed by a colon and a space.",
"Do NOT include any other text, explanations, or formatting before or after the script.",
"Do NOT use background music in the script.",
"Do NOT include any greetings or farewells (e.g., 'Hello everyone', 'Welcome to our show', 'Goodbye').",
fmt.Sprintf("Example of the required format:\n%s: Today we are discussing the article's main points.\n%s: Let's start with the first one.", exampleSpeaker1, exampleSpeaker2),
"Now, convert the article.",
)
r.Transform.ToPodcast.transcriptPrompt = strings.Join(promptSegments, "\n\n")
r.Transform.ToPodcast.speakersDesc = speakersDesc
}
r.Transform.ToText.promptRendered = buf.String()
}
// Match.
@@ -132,9 +233,10 @@ func (r *Rule) Validate() error { //nolint:cyclop
r.matchRE = re
// Action.
switch r.Action {
case "":
if r.Action == "" {
r.Action = ActionCreateOrUpdateLabel
}
switch r.Action {
case ActionCreateOrUpdateLabel:
if r.Label == "" {
return errors.New("label is required for create or update label action")
@@ -148,15 +250,40 @@ func (r *Rule) Validate() error { //nolint:cyclop
}
func (r *Rule) From(c *config.RewriteRule) {
r.If = c.If
r.SourceLabel = c.SourceLabel
r.SkipTooShortThreshold = c.SkipTooShortThreshold
if c.Transform != nil {
if c.Transform != nil { //nolint:nestif
t := &Transform{}
if c.Transform.ToText != nil {
t.ToText = &ToText{
toText := &ToText{
LLM: c.Transform.ToText.LLM,
Prompt: c.Transform.ToText.Prompt,
}
toText.Type = ToTextType(c.Transform.ToText.Type)
if toText.Type == "" {
toText.Type = ToTextTypePrompt // Default to prompt if not specified.
}
t.ToText = toText
}
if c.Transform.ToPodcast != nil {
toPodcast := &ToPodcast{
LLM: c.Transform.ToPodcast.LLM,
EstimateMaximumDuration: time.Duration(c.Transform.ToPodcast.EstimateMaximumDuration),
TranscriptAdditionalPrompt: c.Transform.ToPodcast.TranscriptAdditionalPrompt,
TTSLLM: c.Transform.ToPodcast.TTSLLM,
}
if toPodcast.EstimateMaximumDuration == 0 {
toPodcast.EstimateMaximumDuration = 3 * time.Minute
}
for _, s := range c.Transform.ToPodcast.Speakers {
toPodcast.Speakers = append(toPodcast.Speakers, Speaker{
Name: s.Name,
Role: s.Role,
Voice: s.Voice,
})
}
t.ToPodcast = toPodcast
}
r.Transform = t
}
@@ -169,19 +296,50 @@ func (r *Rule) From(c *config.RewriteRule) {
}
type Transform struct {
ToText *ToText
ToText *ToText
ToPodcast *ToPodcast
}
// ToText configures a text-to-text transformation for a rewrite rule.
type ToText struct {
// Type selects the transformation strategy (prompt, crawl, or crawl_by_jina).
Type ToTextType
// LLM is the name of the LLM to use.
// Only used when Type is ToTextTypePrompt.
LLM string
// Prompt is the prompt for LLM completion.
// The source text will automatically be injected into the prompt.
// Only used when Type is ToTextTypePrompt.
Prompt string
// promptRendered is Prompt after template expansion — populated by Validate.
promptRendered string
}
// ToPodcast configures converting source text into a podcast audio file.
type ToPodcast struct {
// LLM is the model used to generate the dialogue transcript.
LLM string
// EstimateMaximumDuration bounds the target episode length; the transcript
// prompt derives an approximate word budget from it.
EstimateMaximumDuration time.Duration
// TranscriptAdditionalPrompt is appended to the built-in transcript instructions.
TranscriptAdditionalPrompt string
// TTSLLM is the model used to synthesize the transcript into audio.
TTSLLM string
// Speakers lists the participants of the dialogue; at least one is required.
Speakers []Speaker
// Derived fields populated by Validate from the public configuration above.
transcriptPrompt string
speakersDesc string
speakers []llm.Speaker
}
// Speaker describes one participant in the podcast dialogue.
type Speaker struct {
// Name is the speaker's name used in the script (required).
Name string
// Role optionally describes the speaker; rendered as "Name (Role)" in the prompt.
Role string
// Voice is the TTS voice identifier for this speaker (required).
Voice string
}
// ToTextType enumerates the supported text transformation strategies.
type ToTextType string
const (
// ToTextTypePrompt rewrites the source text via an LLM prompt (the default).
ToTextTypePrompt ToTextType = "prompt"
// ToTextTypeCrawl treats the source text as a URL and crawls it locally.
ToTextTypeCrawl ToTextType = "crawl"
// ToTextTypeCrawlByJina treats the source text as a URL and crawls it via Jina.
ToTextTypeCrawlByJina ToTextType = "crawl_by_jina"
)
type Action string
const (
@@ -189,233 +347,7 @@ const (
ActionCreateOrUpdateLabel Action = "create_or_update_label"
)
var promptTemplates = map[string]string{
"category": `
Analyze the content and categorize it into exactly one of these categories:
Technology, Development, Entertainment, Finance, Health, Politics, Other
Classification requirements:
- Choose the SINGLE most appropriate category based on:
* Primary topic and main focus of the content
* Key terminology and concepts used
* Target audience and purpose
* Technical depth and complexity level
- For content that could fit multiple categories:
* Identify the dominant theme
* Consider the most specific applicable category
* Use the primary intended purpose
- If content appears ambiguous:
* Focus on the most prominent aspects
* Consider the practical application
* Choose the category that best serves user needs
Output format:
Return ONLY the category name, no other text or explanation.
Must be one of the provided categories exactly as written.
`,
"tags": `
Analyze the content and add appropriate tags based on:
- Main topics and themes
- Key concepts and terminology
- Target audience and purpose
- Technical depth and domain
- 2-4 tags are enough
Output format:
Return a list of tags, separated by commas, no other text or explanation.
e.g. "AI, Technology, Innovation, Future"
`,
"score": `
Please give a score between 0 and 10 based on the following content.
Evaluate the content comprehensively considering clarity, accuracy, depth, logical structure, language expression, and completeness.
Note: If the content is an article or a text intended to be detailed, the length is an important factor. Generally, content under 300 words may receive a lower score due to lack of substance, unless its type (such as poetry or summary) is inherently suitable for brevity.
Output format:
Return the score (0-10), no other text or explanation.
E.g. "8", "5", "3", etc.
`,
"comment_confucius": `
Please act as Confucius and write a 100-word comment on the article.
Content needs to be in line with the Chinese mainland's regulations.
Output format:
Return the comment only, no other text or explanation.
Reply short and concise, 100 words is enough.
`,
"summary": `
Summarize the article in 100-200 words.
`,
"summary_html_snippet": `
# Task: Create Visually Appealing Information Summary Emails
You are a professional content designer. Please convert the provided articles into **visually modern HTML email segments**, focusing on display effects in modern clients like Gmail and QQ Mail.
## Key Requirements:
1. **Output Format**:
- Only output HTML code snippets, **no need for complete HTML document structure**
- Only generate HTML code for a single article, so users can combine multiple pieces into a complete email
- No explanations, additional comments, or markups
- **No need to add titles and sources**, users will inject them automatically
- No use html backticks, output raw html code directly
- Output directly, no explanation, no comments, no markups
2. **Content Processing**:
- **Don't directly copy the original text**, but extract key information and core insights from each article
- **Each article summary should be 100-200 words**, don't force word count, adjust the word count based on the actual length of the article
- Summarize points in relaxed, natural language, as if chatting with friends, while maintaining depth
- Maintain the original language of the article (e.g., Chinese summary for Chinese articles)
3. **Visual Design**:
- Design should be aesthetically pleasing with coordinated colors
- Use sufficient whitespace and contrast
- Maintain a consistent visual style across all articles
- **Must use multiple visual elements** (charts, cards, quote blocks, etc.), avoid pure text presentation
- Each article should use at least 2-3 different visual elements to make content more intuitive and readable
4. **Highlight Techniques**:
A. **Beautiful Quote Blocks** (for highlighting important viewpoints):
<div style="margin:20px 0; padding:20px; background:linear-gradient(to right, #f8f9fa, #ffffff); border-left:5px solid #4285f4; border-radius:5px; box-shadow:0 2px 8px rgba(0,0,0,0.05);">
<p style="margin:0; font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:16px; line-height:1.6; color:#333; font-weight:500;">
Here is the key viewpoint or finding that needs to be highlighted.
</p>
</div>
B. **Information Cards** (for highlighting key data):
<div style="display:inline-block; margin:10px 10px 10px 0; padding:15px 20px; background-color:#ffffff; border-radius:8px; box-shadow:0 3px 10px rgba(0,0,0,0.08); min-width:120px; text-align:center;">
<p style="margin:0 0 5px 0; font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:14px; color:#666;">Metric Name</p>
<p style="margin:0; font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:24px; font-weight:600; color:#1a73e8;">75%</p>
</div>
C. **Key Points List** (for highlighting multiple points):
<ul style="margin:20px 0; padding-left:0; list-style-type:none;">
<li style="position:relative; margin-bottom:12px; padding-left:28px; font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:15px; line-height:1.6; color:#444;">
<span style="position:absolute; left:0; top:0; width:18px; height:18px; background-color:#4285f4; border-radius:50%; color:white; text-align:center; line-height:18px; font-size:12px;">1</span>
First point description
</li>
<li style="position:relative; margin-bottom:12px; padding-left:28px; font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:15px; line-height:1.6; color:#444;">
<span style="position:absolute; left:0; top:0; width:18px; height:18px; background-color:#4285f4; border-radius:50%; color:white; text-align:center; line-height:18px; font-size:12px;">2</span>
Second point description
</li>
</ul>
D. **Emphasis Text** (for highlighting key words or phrases):
<span style="background:linear-gradient(180deg, rgba(255,255,255,0) 50%, rgba(66,133,244,0.2) 50%); padding:0 2px;">Text to emphasize</span>
5. **Timeline Design** (suitable for event sequences or news developments):
<div style="margin:25px 0; padding:5px 0;">
<h3 style="font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:18px; color:#333; margin-bottom:15px;">Event Development Timeline</h3>
<div style="position:relative; margin-left:30px; padding-left:30px; border-left:2px solid #e0e0e0;">
<!-- Time Point 1 -->
<div style="position:relative; margin-bottom:25px;">
<div style="position:absolute; width:16px; height:16px; background-color:#4285f4; border-radius:50%; left:-40px; top:0; border:3px solid #ffffff; box-shadow:0 2px 5px rgba(0,0,0,0.1);"></div>
<p style="margin:0 0 5px 0; font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:14px; font-weight:500; color:#4285f4;">June 1, 2023</p>
<p style="margin:0; font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:15px; line-height:1.5; color:#333;">Event description content, concisely explaining the key points and impact of the event.</p>
</div>
<!-- Time Point 2 -->
<div style="position:relative; margin-bottom:25px;">
<div style="position:absolute; width:16px; height:16px; background-color:#4285f4; border-radius:50%; left:-40px; top:0; border:3px solid #ffffff; box-shadow:0 2px 5px rgba(0,0,0,0.1);"></div>
<p style="margin:0 0 5px 0; font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:14px; font-weight:500; color:#4285f4;">June 15, 2023</p>
<p style="margin:0; font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:15px; line-height:1.5; color:#333;">Event description content, concisely explaining the key points and impact of the event.</p>
</div>
</div>
</div>
6. **Comparison Table** (for comparing different options or viewpoints):
<div style="margin:25px 0; padding:15px; background-color:#f8f9fa; border-radius:8px; overflow-x:auto;">
<table style="width:100%; border-collapse:collapse; font-family:'Google Sans',Roboto,Arial,sans-serif;">
<thead>
<tr>
<th style="padding:12px 15px; text-align:left; border-bottom:2px solid #e0e0e0; color:#202124; font-weight:500;">Feature</th>
<th style="padding:12px 15px; text-align:left; border-bottom:2px solid #e0e0e0; color:#202124; font-weight:500;">Option A</th>
<th style="padding:12px 15px; text-align:left; border-bottom:2px solid #e0e0e0; color:#202124; font-weight:500;">Option B</th>
</tr>
</thead>
<tbody>
<tr>
<td style="padding:12px 15px; border-bottom:1px solid #e0e0e0; color:#444;">Cost</td>
<td style="padding:12px 15px; border-bottom:1px solid #e0e0e0; color:#444;">Higher</td>
<td style="padding:12px 15px; border-bottom:1px solid #e0e0e0; color:#444;">Moderate</td>
</tr>
<tr>
<td style="padding:12px 15px; border-bottom:1px solid #e0e0e0; color:#444;">Efficiency</td>
<td style="padding:12px 15px; border-bottom:1px solid #e0e0e0; color:#444;">Very High</td>
<td style="padding:12px 15px; border-bottom:1px solid #e0e0e0; color:#444;">Average</td>
</tr>
</tbody>
</table>
</div>
7. **Chart Data Processing**:
- Bar Chart/Horizontal Bars:
<div style="margin:20px 0; padding:15px; background-color:#f8f9fa; border-radius:8px;">
<p style="margin:0 0 15px 0; font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:16px; font-weight:500; color:#333;">Data Comparison</p>
<!-- Item 1 -->
<div style="margin-bottom:12px;">
<div style="display:flex; align-items:center; justify-content:space-between; margin-bottom:5px;">
<span style="font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:14px; color:#555;">Project A</span>
<span style="font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:14px; font-weight:500; color:#333;">65%</span>
</div>
<div style="height:10px; width:100%; background-color:#e8eaed; border-radius:5px; overflow:hidden;">
<div style="height:100%; width:65%; background:linear-gradient(to right, #4285f4, #5e97f6); border-radius:5px;"></div>
</div>
</div>
<!-- Item 2 -->
<div style="margin-bottom:12px;">
<div style="display:flex; align-items:center; justify-content:space-between; margin-bottom:5px;">
<span style="font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:14px; color:#555;">Project B</span>
<span style="font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:14px; font-weight:500; color:#333;">42%</span>
</div>
<div style="height:10px; width:100%; background-color:#e8eaed; border-radius:5px; overflow:hidden;">
<div style="height:100%; width:42%; background:linear-gradient(to right, #ea4335, #f07575); border-radius:5px;"></div>
</div>
</div>
</div>
8. **Highlight Box** (for displaying tips or reminders):
<div style="margin:25px 0; padding:20px; background-color:#fffde7; border-radius:8px; border-left:4px solid #fdd835; box-shadow:0 1px 5px rgba(0,0,0,0.05);">
<div style="display:flex; align-items:flex-start;">
<div style="flex-shrink:0; margin-right:15px; width:24px; height:24px; background-color:#fdd835; border-radius:50%; display:flex; align-items:center; justify-content:center;">
<span style="color:#fff; font-weight:bold; font-size:16px;">!</span>
</div>
<div>
<p style="margin:0 0 5px 0; font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:16px; font-weight:500; color:#333;">Tip</p>
<p style="margin:0; font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:15px; line-height:1.6; color:#555;">
Here are some additional tips or suggestions to help readers better understand or apply the article content.
</p>
</div>
</div>
</div>
9. **Summary Box**:
<div style="margin:25px 0; padding:20px; background-color:#f2f7fd; border-radius:8px; box-shadow:0 1px 5px rgba(66,133,244,0.1);">
<p style="margin:0 0 10px 0; font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:16px; font-weight:500; color:#1a73e8;">In Simple Terms</p>
<p style="margin:0; font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:15px; line-height:1.6; color:#333;">
This is a concise summary of the entire content, highlighting the most critical findings and conclusions.
</p>
</div>
## Notes:
1. **Only generate content for a single article**, not including title and source, and not including HTML head and tail structure
2. Content should be **200-300 words**, don't force word count
3. **Must use multiple visual elements** (at least 2-3 types), avoid monotonous pure text presentation
4. Use relaxed, natural language, as if chatting with friends
5. Create visual charts for important data, rather than just describing with text
6. Use quote blocks to highlight important viewpoints, and lists to organize multiple points
7. Appropriately use emojis and conversational expressions to increase friendliness
8. Note that the article content has been provided in the previous message, please reply directly, no explanation, no comments, no markups
`,
}
// --- Factory code block ---
type Factory component.Factory[Rewriter, config.App, Dependencies]
func NewFactory(mockOn ...component.MockOption) Factory {
@@ -445,6 +377,8 @@ func new(instance string, app *config.App, dependencies Dependencies) (Rewriter,
Config: c,
Dependencies: dependencies,
}),
crawler: crawl.NewLocal(),
jinaCrawler: crawl.NewJina(app.Jina.Token),
}, nil
}
@@ -452,6 +386,9 @@ func new(instance string, app *config.App, dependencies Dependencies) (Rewriter,
type rewriter struct {
*component.Base[Config, Dependencies]
crawler crawl.Crawler
jinaCrawler crawl.Crawler
}
func (r *rewriter) Reload(app *config.App) error {
@@ -462,15 +399,22 @@ func (r *rewriter) Reload(app *config.App) error {
}
r.SetConfig(newConfig)
r.jinaCrawler = crawl.NewJina(app.Jina.Token)
return nil
}
func (r *rewriter) Labels(ctx context.Context, labels model.Labels) (model.Labels, error) {
func (r *rewriter) Labels(ctx context.Context, labels model.Labels) (rewritten model.Labels, err error) {
ctx = telemetry.StartWith(ctx, append(r.TelemetryLabels(), telemetrymodel.KeyOperation, "Labels")...)
defer func() { telemetry.End(ctx, nil) }()
defer func() { telemetry.End(ctx, err) }()
rules := *r.Config()
for _, rule := range rules {
// If.
if !rule.if_.Match(labels) {
continue
}
// Get source text based on source label.
sourceText := labels.Get(rule.SourceLabel)
if utf8.RuneCountInString(sourceText) < *rule.SkipTooShortThreshold {
@@ -478,13 +422,9 @@ func (r *rewriter) Labels(ctx context.Context, labels model.Labels) (model.Label
}
// Transform text if configured.
text := sourceText
if rule.Transform != nil {
transformed, err := r.transformText(ctx, rule.Transform, sourceText)
if err != nil {
return nil, errors.Wrap(err, "transform text")
}
text = transformed
text, err := r.transform(ctx, rule.Transform, sourceText)
if err != nil {
return nil, errors.Wrap(err, "transform")
}
// Check if text matches the rule.
@@ -506,15 +446,54 @@ func (r *rewriter) Labels(ctx context.Context, labels model.Labels) (model.Label
return labels, nil
}
// transformText transforms text using configured LLM.
func (r *rewriter) transformText(ctx context.Context, transform *Transform, text string) (string, error) {
// transform dispatches to the configured transformation. A nil transform, or
// one with neither variant set, returns the source text unchanged.
func (r *rewriter) transform(ctx context.Context, transform *Transform, sourceText string) (string, error) {
	switch {
	case transform == nil:
		return sourceText, nil
	case transform.ToText != nil:
		return r.transformText(ctx, transform.ToText, sourceText)
	case transform.ToPodcast != nil:
		return r.transformPodcast(ctx, transform.ToPodcast, sourceText)
	default:
		return sourceText, nil
	}
}
// transformText resolves the text transformation by type: the crawl variants
// treat the source text as a URL and fetch it, every other type goes through
// the LLM prompt path.
func (r *rewriter) transformText(ctx context.Context, toText *ToText, text string) (string, error) {
	if toText.Type == ToTextTypeCrawl {
		return r.transformTextCrawl(ctx, r.crawler, text)
	}
	if toText.Type == ToTextTypeCrawlByJina {
		return r.transformTextCrawl(ctx, r.jinaCrawler, text)
	}

	// ToTextTypePrompt and any unrecognized value both take the prompt path,
	// mirroring the original switch's default branch.
	return r.transformTextPrompt(ctx, toText, text)
}
// transformTextCrawl fetches url with the given crawler and returns the page
// content rendered as markdown.
func (r *rewriter) transformTextCrawl(ctx context.Context, crawler crawl.Crawler, url string) (string, error) {
	md, err := crawler.Markdown(ctx, url)
	if err != nil {
		return "", errors.Wrapf(err, "crawl %s", url)
	}

	return string(md), nil
}
// transformTextPrompt transforms text using configured LLM.
func (r *rewriter) transformTextPrompt(ctx context.Context, toText *ToText, text string) (string, error) {
// Get LLM instance.
llm := r.Dependencies().LLMFactory.Get(transform.ToText.LLM)
llm := r.Dependencies().LLMFactory.Get(toText.LLM)
// Call completion.
result, err := llm.String(ctx, []string{
transform.ToText.promptRendered,
"The content to be processed is below, and the processing requirements are as above",
toText.promptRendered,
text, // TODO: may place to first line to hit the model cache in different rewrite rules.
})
if err != nil {
@@ -525,32 +504,77 @@ func (r *rewriter) transformText(ctx context.Context, transform *Transform, text
}
func (r *rewriter) transformTextHack(text string) string {
bytes := unsafe.Slice(unsafe.StringData(text), len(text))
start := 0
end := len(bytes)
// TODO: optimize this.
text = strings.ReplaceAll(text, "```html", "")
text = strings.ReplaceAll(text, "```markdown", "")
text = strings.ReplaceAll(text, "```", "")
// Remove the last line if it's empty.
// This is a hack to avoid the model output a empty line.
// E.g. category: tech\n
if end > 0 && bytes[end-1] == '\n' {
end--
return text
}
// audioKey derives a stable object-storage key for podcast audio from the
// transcript's hash, so identical transcripts map to the same stored file.
var audioKey = func(transcript, ext string) string {
	sum := hashutil.Sum64(transcript)

	return "podcasts/" + strconv.FormatUint(sum, 10) + "." + ext
}
func (r *rewriter) transformPodcast(ctx context.Context, toPodcast *ToPodcast, sourceText string) (url string, err error) {
transcript, err := r.generateTranscript(ctx, toPodcast, sourceText)
if err != nil {
return "", errors.Wrap(err, "generate podcast transcript")
}
// Remove the html backticks.
if end-start >= 7 && string(bytes[start:start+7]) == "```html" {
start += 7
}
if end-start >= 3 && string(bytes[end-3:end]) == "```" {
end -= 3
audioKey := audioKey(transcript, "wav")
url, err = r.Dependencies().ObjectStorage.Get(ctx, audioKey)
switch {
case err == nil:
// May canceled at last time by reload, fast return.
return url, nil
case errors.Is(err, object.ErrNotFound):
// Not found, generate new audio.
default:
return "", errors.Wrap(err, "get audio")
}
// If no changes, return the original string.
if start == 0 && end == len(bytes) {
return text
audioStream, err := r.generateAudio(ctx, toPodcast, transcript)
if err != nil {
return "", errors.Wrap(err, "generate podcast audio")
}
defer func() {
if closeErr := audioStream.Close(); closeErr != nil {
err = errors.Wrap(err, "close audio stream")
}
}()
url, err = r.Dependencies().ObjectStorage.Put(ctx, audioKey, audioStream, "audio/wav")
if err != nil {
return "", errors.Wrap(err, "store podcast audio")
}
// Only copy one time.
return string(bytes[start:end])
return url, nil
}
// generateTranscript produces the dialogue script for a podcast episode and
// prefixes it with the speaker descriptions prepared during validation.
func (r *rewriter) generateTranscript(ctx context.Context, toPodcast *ToPodcast, sourceText string) (string, error) {
	model := r.Dependencies().LLMFactory.Get(toPodcast.LLM)

	script, err := model.String(ctx, []string{toPodcast.transcriptPrompt, sourceText})
	if err != nil {
		return "", errors.Wrap(err, "llm completion")
	}

	header := toPodcast.speakersDesc + "\n\nFollowed by the actual dialogue script:\n"

	return header + script, nil
}
// generateAudio synthesizes the transcript into a WAV stream via the
// configured TTS model. The caller is responsible for closing the stream.
func (r *rewriter) generateAudio(ctx context.Context, toPodcast *ToPodcast, transcript string) (io.ReadCloser, error) {
	tts := r.Dependencies().LLMFactory.Get(toPodcast.TTSLLM)

	stream, err := tts.WAV(ctx, transcript, toPodcast.speakers)
	if err != nil {
		return nil, errors.Wrap(err, "calling tts llm")
	}

	return stream, nil
}
type mockRewriter struct {

View File

@@ -2,6 +2,8 @@ package rewrite
import (
"context"
"io"
"strings"
"testing"
. "github.com/onsi/gomega"
@@ -12,6 +14,7 @@ import (
"github.com/glidea/zenfeed/pkg/component"
"github.com/glidea/zenfeed/pkg/llm"
"github.com/glidea/zenfeed/pkg/model"
"github.com/glidea/zenfeed/pkg/storage/object"
"github.com/glidea/zenfeed/pkg/test"
)
@@ -19,8 +22,9 @@ func TestLabels(t *testing.T) {
RegisterTestingT(t)
type givenDetail struct {
config *Config
llmMock func(m *mock.Mock)
config *Config
llmMock func(m *mock.Mock)
objectStorageMock func(m *mock.Mock)
}
type whenDetail struct {
inputLabels model.Labels
@@ -44,6 +48,7 @@ func TestLabels(t *testing.T) {
SkipTooShortThreshold: ptr.To(10),
Transform: &Transform{
ToText: &ToText{
Type: ToTextTypePrompt,
LLM: "mock-llm",
Prompt: "{{ .category }}", // Using a simple template for testing
},
@@ -79,6 +84,7 @@ func TestLabels(t *testing.T) {
SkipTooShortThreshold: ptr.To(10),
Transform: &Transform{
ToText: &ToText{
Type: ToTextTypePrompt,
LLM: "mock-llm",
Prompt: "{{ .category }}",
},
@@ -148,6 +154,7 @@ func TestLabels(t *testing.T) {
SkipTooShortThreshold: ptr.To(10),
Transform: &Transform{
ToText: &ToText{
Type: ToTextTypePrompt,
LLM: "mock-llm",
Prompt: "{{ .category }}",
promptRendered: "Analyze the content and categorize it...",
@@ -170,7 +177,7 @@ func TestLabels(t *testing.T) {
},
ThenExpected: thenExpected{
outputLabels: nil,
err: errors.New("transform text: llm completion: LLM failed"),
err: errors.New("transform: llm completion: LLM failed"),
isErr: true,
},
},
@@ -186,6 +193,7 @@ func TestLabels(t *testing.T) {
SkipTooShortThreshold: ptr.To(10),
Transform: &Transform{
ToText: &ToText{
Type: ToTextTypePrompt,
LLM: "mock-llm",
Prompt: "{{ .category }}",
promptRendered: "Analyze the content and categorize it...",
@@ -216,22 +224,163 @@ func TestLabels(t *testing.T) {
isErr: false,
},
},
{
Scenario: "Successfully generate podcast from content",
Given: "a rule to convert content to a podcast with all dependencies mocked to succeed",
When: "processing labels with content to be converted to a podcast",
Then: "should return labels with a new podcast_url label",
GivenDetail: givenDetail{
config: &Config{
{
SourceLabel: model.LabelContent,
Transform: &Transform{
ToPodcast: &ToPodcast{
LLM: "mock-llm-transcript",
TTSLLM: "mock-llm-tts",
Speakers: []Speaker{{Name: "narrator", Voice: "alloy"}},
},
},
Action: ActionCreateOrUpdateLabel,
Label: "podcast_url",
},
},
llmMock: func(m *mock.Mock) {
m.On("String", mock.Anything, mock.Anything).Return("This is the podcast script.", nil).Once()
m.On("WAV", mock.Anything, mock.Anything, mock.AnythingOfType("[]llm.Speaker")).
Return(io.NopCloser(strings.NewReader("fake audio data")), nil).Once()
},
objectStorageMock: func(m *mock.Mock) {
m.On("Put", mock.Anything, mock.AnythingOfType("string"), mock.Anything, "audio/wav").
Return("http://storage.example.com/podcast.wav", nil).Once()
m.On("Get", mock.Anything, mock.AnythingOfType("string")).Return("", object.ErrNotFound).Once()
},
},
WhenDetail: whenDetail{
inputLabels: model.Labels{
{Key: model.LabelContent, Value: "This is a long article to be converted into a podcast."},
},
},
ThenExpected: thenExpected{
outputLabels: model.Labels{
{Key: model.LabelContent, Value: "This is a long article to be converted into a podcast."},
{Key: "podcast_url", Value: "http://storage.example.com/podcast.wav"},
},
isErr: false,
},
},
{
Scenario: "Fail podcast generation due to transcription LLM error",
Given: "a rule to convert content to a podcast, but the transcription LLM is mocked to fail",
When: "processing labels",
Then: "should return an error related to transcription failure",
GivenDetail: givenDetail{
config: &Config{
{
SourceLabel: model.LabelContent,
Transform: &Transform{
ToPodcast: &ToPodcast{LLM: "mock-llm-transcript", Speakers: []Speaker{{Name: "narrator", Voice: "alloy"}}},
},
Action: ActionCreateOrUpdateLabel, Label: "podcast_url",
},
},
llmMock: func(m *mock.Mock) {
m.On("String", mock.Anything, mock.Anything).Return("", errors.New("transcript failed")).Once()
},
},
WhenDetail: whenDetail{inputLabels: model.Labels{{Key: model.LabelContent, Value: "article"}}},
ThenExpected: thenExpected{
outputLabels: nil,
err: errors.New("transform: generate podcast transcript: llm completion: transcript failed"),
isErr: true,
},
},
{
Scenario: "Fail podcast generation due to TTS LLM error",
Given: "a rule to convert content to a podcast, but the TTS LLM is mocked to fail",
When: "processing labels",
Then: "should return an error related to TTS failure",
GivenDetail: givenDetail{
config: &Config{
{
SourceLabel: model.LabelContent,
Transform: &Transform{
ToPodcast: &ToPodcast{LLM: "mock-llm-transcript", TTSLLM: "mock-llm-tts", Speakers: []Speaker{{Name: "narrator", Voice: "alloy"}}},
},
Action: ActionCreateOrUpdateLabel, Label: "podcast_url",
},
},
llmMock: func(m *mock.Mock) {
m.On("String", mock.Anything, mock.Anything).Return("script", nil).Once()
m.On("WAV", mock.Anything, mock.Anything, mock.Anything).Return(nil, errors.New("tts failed")).Once()
},
objectStorageMock: func(m *mock.Mock) {
m.On("Get", mock.Anything, mock.AnythingOfType("string")).Return("", object.ErrNotFound).Once()
},
},
WhenDetail: whenDetail{inputLabels: model.Labels{{Key: model.LabelContent, Value: "article"}}},
ThenExpected: thenExpected{
outputLabels: nil,
err: errors.New("transform: generate podcast audio: calling tts llm: tts failed"),
isErr: true,
},
},
{
Scenario: "Fail podcast generation due to object storage error",
Given: "a rule to convert content to a podcast, but object storage is mocked to fail",
When: "processing labels",
Then: "should return an error related to storage failure",
GivenDetail: givenDetail{
config: &Config{
{
SourceLabel: model.LabelContent,
Transform: &Transform{
ToPodcast: &ToPodcast{LLM: "mock-llm-transcript", TTSLLM: "mock-llm-tts", Speakers: []Speaker{{Name: "narrator", Voice: "alloy"}}},
},
Action: ActionCreateOrUpdateLabel, Label: "podcast_url",
},
},
llmMock: func(m *mock.Mock) {
m.On("String", mock.Anything, mock.Anything).Return("script", nil).Once()
m.On("WAV", mock.Anything, mock.Anything, mock.Anything).Return(io.NopCloser(strings.NewReader("fake audio")), nil).Once()
},
objectStorageMock: func(m *mock.Mock) {
m.On("Put", mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return("", errors.New("storage failed")).Once()
m.On("Get", mock.Anything, mock.AnythingOfType("string")).Return("", object.ErrNotFound).Once()
},
},
WhenDetail: whenDetail{inputLabels: model.Labels{{Key: model.LabelContent, Value: "article"}}},
ThenExpected: thenExpected{
outputLabels: nil,
err: errors.New("transform: store podcast audio: storage failed"),
isErr: true,
},
},
}
for _, tt := range tests {
t.Run(tt.Scenario, func(t *testing.T) {
// Given.
var mockLLMFactory llm.Factory
var mockInstance *mock.Mock // Store the mock instance for assertion
// Create mock factory and capture the mock.Mock instance.
mockOption := component.MockOption(func(m *mock.Mock) {
mockInstance = m // Capture the mock instance.
var mockLLMInstance *mock.Mock
llmMockOption := component.MockOption(func(m *mock.Mock) {
mockLLMInstance = m
if tt.GivenDetail.llmMock != nil {
tt.GivenDetail.llmMock(m)
}
})
mockLLMFactory, err := llm.NewFactory("", nil, llm.FactoryDependencies{}, mockOption) // Use the factory directly with the option
mockLLMFactory, err := llm.NewFactory("", nil, llm.FactoryDependencies{}, llmMockOption)
Expect(err).NotTo(HaveOccurred())
var mockObjectStorage object.Storage
var mockObjectStorageInstance *mock.Mock
objectStorageMockOption := component.MockOption(func(m *mock.Mock) {
mockObjectStorageInstance = m
if tt.GivenDetail.objectStorageMock != nil {
tt.GivenDetail.objectStorageMock(m)
}
})
mockObjectStorageFactory := object.NewFactory(objectStorageMockOption)
mockObjectStorage, err = mockObjectStorageFactory.New("test", nil, object.Dependencies{})
Expect(err).NotTo(HaveOccurred())
// Manually validate config to compile regex and render templates.
@@ -248,7 +397,8 @@ func TestLabels(t *testing.T) {
Instance: "test",
Config: tt.GivenDetail.config,
Dependencies: Dependencies{
LLMFactory: mockLLMFactory, // Pass the mock factory
LLMFactory: mockLLMFactory, // Pass the mock factory
ObjectStorage: mockObjectStorage,
},
}),
}
@@ -276,10 +426,12 @@ func TestLabels(t *testing.T) {
Expect(outputLabels).To(Equal(tt.ThenExpected.outputLabels))
}
// Verify LLM calls if stubs were provided.
if tt.GivenDetail.llmMock != nil && mockInstance != nil {
// Assert expectations on the captured mock instance.
mockInstance.AssertExpectations(t)
// Verify mock calls if stubs were provided.
if tt.GivenDetail.llmMock != nil && mockLLMInstance != nil {
mockLLMInstance.AssertExpectations(t)
}
if tt.GivenDetail.objectStorageMock != nil && mockObjectStorageInstance != nil {
mockObjectStorageInstance.AssertExpectations(t)
}
})
}

View File

@@ -55,7 +55,7 @@ func (r *periodic) Run() (err error) {
end := time.Date(today.Year(), today.Month(), today.Day(),
config.end.Hour(), config.end.Minute(), 0, 0, today.Location())
buffer := 20 * time.Minute
buffer := 30 * time.Minute
endPlusBuffer := end.Add(buffer)
if now.Before(end) || now.After(endPlusBuffer) {
return
@@ -77,7 +77,7 @@ func (r *periodic) Run() (err error) {
return nil
case now := <-tick.C:
iter(now)
tick.Reset(3 * time.Minute)
tick.Reset(5 * time.Minute)
}
}
}
@@ -119,6 +119,12 @@ func (r *periodic) execute(ctx context.Context, now time.Time) error {
return nil
}
// Attach labels to feeds.
for _, feed := range feeds {
feed.Labels = append(feed.Labels, config.labels...)
feed.Labels.EnsureSorted()
}
// Notify.
r.Dependencies().Out <- &Result{
Rule: config.Name,

View File

@@ -18,11 +18,11 @@ package rule
import (
"strings"
"time"
"unicode/utf8"
"github.com/pkg/errors"
"github.com/glidea/zenfeed/pkg/component"
"github.com/glidea/zenfeed/pkg/model"
"github.com/glidea/zenfeed/pkg/storage/feed"
"github.com/glidea/zenfeed/pkg/storage/feed/block"
)
@@ -38,6 +38,8 @@ type Config struct {
Query string
Threshold float32
LabelFilters []string
Labels map[string]string
labels model.Labels
// Periodic type.
EveryDay string // e.g. "00:00~23:59", or "-22:00~7:00" (yesterday 22:00 to today 07:00)
@@ -58,15 +60,15 @@ func (c *Config) Validate() error { //nolint:cyclop,gocognit
if c.Name == "" {
return errors.New("name is required")
}
if c.Query != "" && utf8.RuneCountInString(c.Query) < 5 {
return errors.New("query must be at least 5 characters")
}
if c.Threshold == 0 {
c.Threshold = 0.6
c.Threshold = 0.5
}
if c.Threshold < 0 || c.Threshold > 1 {
return errors.New("threshold must be between 0 and 1")
}
if len(c.Labels) > 0 {
c.labels.FromMap(c.Labels)
}
if c.EveryDay != "" && c.WatchInterval != 0 {
return errors.New("every_day and watch_interval cannot both be set")
}

View File

@@ -101,6 +101,12 @@ func (r *watch) execute(ctx context.Context, start, end time.Time) error {
return nil
}
// Attach labels to feeds.
for _, feed := range feeds {
feed.Labels = append(feed.Labels, config.labels...)
feed.Labels.EnsureSorted()
}
// Split feeds by start time.
feedsByStart := make(map[time.Time][]*block.FeedVO) // Start time -> feeds.
for _, feed := range feeds {

View File

@@ -58,8 +58,9 @@ func (c *Config) From(app *config.App) *Config {
Query: r.Query,
Threshold: r.Threshold,
LabelFilters: r.LabelFilters,
Labels: r.Labels,
EveryDay: r.EveryDay,
WatchInterval: r.WatchInterval,
WatchInterval: time.Duration(r.WatchInterval),
}
}

View File

@@ -66,20 +66,21 @@ func (c *Config) From(app *config.App) {
c.Scrapers = make([]scraper.Config, len(app.Scrape.Sources))
for i := range app.Scrape.Sources {
c.Scrapers[i] = scraper.Config{
Past: app.Scrape.Past,
Past: time.Duration(app.Scrape.Past),
Name: app.Scrape.Sources[i].Name,
Interval: app.Scrape.Sources[i].Interval,
Interval: time.Duration(app.Scrape.Sources[i].Interval),
Labels: model.Labels{},
}
c.Scrapers[i].Labels.FromMap(app.Scrape.Sources[i].Labels)
if c.Scrapers[i].Interval <= 0 {
c.Scrapers[i].Interval = app.Scrape.Interval
c.Scrapers[i].Interval = time.Duration(app.Scrape.Interval)
}
if app.Scrape.Sources[i].RSS != nil {
c.Scrapers[i].RSS = &scraper.ScrapeSourceRSS{
URL: app.Scrape.Sources[i].RSS.URL,
RSSHubEndpoint: app.Scrape.RSSHubEndpoint,
RSSHubRoutePath: app.Scrape.Sources[i].RSS.RSSHubRoutePath,
RSSHubAccessKey: app.Scrape.RSSHubAccessKey,
}
}
}
@@ -216,6 +217,10 @@ func (m *manager) reload(config *Config) (err error) {
func (m *manager) runOrRestartScrapers(config *Config, newScrapers map[string]scraper.Scraper) error {
for i := range config.Scrapers {
c := &config.Scrapers[i]
if err := c.Validate(); err != nil {
return errors.Wrapf(err, "validate scraper %s", c.Name)
}
if err := m.runOrRestartScraper(c, newScrapers); err != nil {
return errors.Wrapf(err, "run or restart scraper %s", c.Name)
}

View File

@@ -33,6 +33,7 @@ type ScrapeSourceRSS struct {
URL string
RSSHubEndpoint string
RSSHubRoutePath string
RSSHubAccessKey string
}
func (c *ScrapeSourceRSS) Validate() error {
@@ -46,9 +47,22 @@ func (c *ScrapeSourceRSS) Validate() error {
return errors.New("URL must be a valid HTTP/HTTPS URL")
}
// Append access key as query parameter if provided
c.appendAccessKey()
return nil
}
func (c *ScrapeSourceRSS) appendAccessKey() {
if c.RSSHubEndpoint != "" && c.RSSHubAccessKey != "" && !strings.Contains(c.URL, "key=") {
if strings.Contains(c.URL, "?") {
c.URL += "&key=" + c.RSSHubAccessKey
} else {
c.URL += "?key=" + c.RSSHubAccessKey
}
}
}
// --- Factory code block ---
func newRSSReader(config *ScrapeSourceRSS) (reader, error) {
if err := config.Validate(); err != nil {
@@ -65,7 +79,6 @@ func newRSSReader(config *ScrapeSourceRSS) (reader, error) {
}
// --- Implementation code block ---
type rssReader struct {
config *ScrapeSourceRSS
client client

View File

@@ -122,6 +122,55 @@ func TestNewRSS(t *testing.T) {
},
},
},
{
Scenario: "Valid Configuration - RSSHub with Access Key",
Given: "a valid configuration with RSSHub details and access key",
When: "creating a new RSS reader",
Then: "should succeed, construct the URL with access key, and return a valid reader",
GivenDetail: givenDetail{
config: &ScrapeSourceRSS{
RSSHubEndpoint: "http://rsshub.app/",
RSSHubRoutePath: "/_/test",
RSSHubAccessKey: "testkey",
},
},
WhenDetail: whenDetail{},
ThenExpected: thenExpected{
wantErr: false,
validateFunc: func(t *testing.T, r reader) {
Expect(r).NotTo(BeNil())
rssReader, ok := r.(*rssReader)
Expect(ok).To(BeTrue())
Expect(rssReader.config.URL).To(Equal("http://rsshub.app/_/test?key=testkey"))
Expect(rssReader.config.RSSHubEndpoint).To(Equal("http://rsshub.app/"))
Expect(rssReader.config.RSSHubRoutePath).To(Equal("/_/test"))
Expect(rssReader.config.RSSHubAccessKey).To(Equal("testkey"))
},
},
},
{
Scenario: "Valid Configuration - URL with Access Key",
Given: "a valid configuration with URL and access key",
When: "creating a new RSS reader",
Then: "should succeed, append access key to URL, and return a valid reader",
GivenDetail: givenDetail{
config: &ScrapeSourceRSS{
URL: "http://example.com/feed",
RSSHubAccessKey: "testkey",
},
},
WhenDetail: whenDetail{},
ThenExpected: thenExpected{
wantErr: false,
validateFunc: func(t *testing.T, r reader) {
Expect(r).NotTo(BeNil())
rssReader, ok := r.(*rssReader)
Expect(ok).To(BeTrue())
Expect(rssReader.config.URL).To(Equal("http://example.com/feed"))
Expect(rssReader.config.RSSHubAccessKey).To(Equal("testkey"))
},
},
},
}
// --- Run tests ---

View File

@@ -55,7 +55,7 @@ const maxPast = 15 * 24 * time.Hour
func (c *Config) Validate() error {
if c.Past <= 0 {
c.Past = 3 * timeutil.Day
c.Past = timeutil.Day
}
if c.Past > maxPast {
c.Past = maxPast
@@ -69,6 +69,11 @@ func (c *Config) Validate() error {
if c.Name == "" {
return errors.New("name cannot be empty")
}
if c.RSS != nil {
if err := c.RSS.Validate(); err != nil {
return errors.Wrap(err, "invalid RSS config")
}
}
return nil
}
@@ -208,10 +213,11 @@ func (s *scraper) fillIDs(feeds []*model.Feed) []*model.Feed {
for _, feed := range feeds {
// We can not use the pub time to join the hash,
// because the pub time is dynamic for some sources.
//
// title may be changed for some sources... so...
source := feed.Labels.Get(model.LabelSource)
title := feed.Labels.Get(model.LabelTitle)
link := feed.Labels.Get(model.LabelLink)
feed.ID = hashutil.Sum64s([]string{source, title, link})
feed.ID = hashutil.Sum64s([]string{source, link})
}
return feeds
@@ -226,7 +232,7 @@ func (s *scraper) filterExists(ctx context.Context, feeds []*model.Feed) (filter
appendToResult := func(feed *model.Feed) {
key := keyPrefix + strconv.FormatUint(feed.ID, 10)
value := timeutil.Format(feed.Time)
if err := s.Dependencies().KVStorage.Set(ctx, key, value, ttl); err != nil {
if err := s.Dependencies().KVStorage.Set(ctx, []byte(key), []byte(value), ttl); err != nil {
log.Error(ctx, err, "set last try store time")
}
filtered = append(filtered, feed)
@@ -235,7 +241,7 @@ func (s *scraper) filterExists(ctx context.Context, feeds []*model.Feed) (filter
for _, feed := range feeds {
key := keyPrefix + strconv.FormatUint(feed.ID, 10)
lastTryStored, err := s.Dependencies().KVStorage.Get(ctx, key)
lastTryStored, err := s.Dependencies().KVStorage.Get(ctx, []byte(key))
switch {
default:
log.Error(ctx, err, "get last stored time, fallback to continue writing")
@@ -245,7 +251,7 @@ func (s *scraper) filterExists(ctx context.Context, feeds []*model.Feed) (filter
appendToResult(feed)
case err == nil:
t, err := timeutil.Parse(lastTryStored)
t, err := timeutil.Parse(string(lastTryStored))
if err != nil {
log.Error(ctx, err, "parse last try stored time, fallback to continue writing")
appendToResult(feed)

View File

@@ -53,8 +53,8 @@ func TestConfig_Validate(t *testing.T) {
ThenExpected: thenExpected{
expectedConfig: &Config{
Name: "test",
Past: 3 * timeutil.Day, // Default Past
Interval: time.Hour, // Default/Minimum Interval
Past: timeutil.Day, // Default Past
Interval: time.Hour, // Default/Minimum Interval
},
isErr: false,
},
@@ -89,7 +89,7 @@ func TestConfig_Validate(t *testing.T) {
ThenExpected: thenExpected{
expectedConfig: &Config{
Name: "test",
Past: 3 * timeutil.Day, // Default Past
Past: timeutil.Day, // Default Past
Interval: 10 * time.Minute, // Minimum Interval
},
isErr: false,
@@ -103,7 +103,7 @@ func TestConfig_Validate(t *testing.T) {
GivenDetail: givenDetail{
config: &Config{
Name: "test",
Past: 24 * time.Hour,
Past: 4 * time.Hour,
Interval: 30 * time.Minute,
},
},
@@ -111,7 +111,7 @@ func TestConfig_Validate(t *testing.T) {
ThenExpected: thenExpected{
expectedConfig: &Config{
Name: "test",
Past: 24 * time.Hour,
Past: 4 * time.Hour,
Interval: 30 * time.Minute,
},
isErr: false,
@@ -244,7 +244,7 @@ func TestNew(t *testing.T) {
WhenDetail: whenDetail{},
ThenExpected: thenExpected{
isErr: true,
wantErrMsg: "creating source: invalid RSS config: URL must be a valid HTTP/HTTPS URL", // Error from newRSSReader via newReader
wantErrMsg: "invalid RSS config: URL must be a valid HTTP/HTTPS URL", // Error from newRSSReader via newReader
},
},
{
@@ -264,7 +264,7 @@ func TestNew(t *testing.T) {
WhenDetail: whenDetail{},
ThenExpected: thenExpected{
isErr: true,
wantErrMsg: "creating source: source not supported", // Error from newReader
wantErrMsg: "source not supported", // Error from newReader
},
},
}

View File

@@ -26,7 +26,6 @@ import (
"runtime"
"slices"
"strconv"
"strings"
"sync"
"sync/atomic"
"time"
@@ -277,47 +276,20 @@ type QueryOptions struct {
Query string
Threshold float32
LabelFilters []string
labelFilters []LabelFilter
labelFilters model.LabelFilters
Limit int
Start, End time.Time
}
var (
LabelFilterEqual = "="
LabelFilterNotEqual = "!="
NewLabelFilter = func(key, value string, eq bool) string {
if eq {
return fmt.Sprintf("%s%s%s", key, LabelFilterEqual, value)
}
return fmt.Sprintf("%s%s%s", key, LabelFilterNotEqual, value)
}
ParseLabelFilter = func(filter string) (LabelFilter, error) {
eq := false
parts := strings.Split(filter, LabelFilterNotEqual)
if len(parts) != 2 {
parts = strings.Split(filter, LabelFilterEqual)
eq = true
}
if len(parts) != 2 {
return LabelFilter{}, errors.New("invalid label filter")
}
return LabelFilter{Label: parts[0], Value: parts[1], Equal: eq}, nil
}
)
func (q *QueryOptions) Validate() error { //nolint:cyclop
if q.Threshold < 0 || q.Threshold > 1 {
return errors.New("threshold must be between 0 and 1")
}
for _, labelFilter := range q.LabelFilters {
if labelFilter == "" {
for _, s := range q.LabelFilters {
if s == "" {
return errors.New("label filter is required")
}
filter, err := ParseLabelFilter(labelFilter)
filter, err := model.NewLabelFilter(s)
if err != nil {
return errors.Wrap(err, "parse label filter")
}
@@ -368,13 +340,6 @@ func (q *QueryOptions) HitTimeRangeCondition(b Block) bool {
return queryAsBase || blockAsBase
}
// LabelFilter defines the matcher for an item.
type LabelFilter struct {
Label string
Equal bool
Value string
}
// --- Factory code block ---
type Factory component.Factory[Block, Config, Dependencies]
@@ -523,9 +488,9 @@ type block struct {
coldLoaded bool
}
func (b *block) Run() error {
func (b *block) Run() (err error) {
ctx := telemetry.StartWith(b.Context(), append(b.TelemetryLabels(), telemetrymodel.KeyOperation, "Run")...)
defer func() { telemetry.End(ctx, nil) }()
defer func() { telemetry.End(ctx, err) }()
// Maintain metrics.
go b.maintainMetrics(ctx)
@@ -715,9 +680,9 @@ func (b *block) Query(ctx context.Context, query QueryOptions) (feeds []*FeedVO,
return result.Slice(), nil
}
func (b *block) Exists(ctx context.Context, id uint64) (bool, error) {
func (b *block) Exists(ctx context.Context, id uint64) (exists bool, err error) {
ctx = telemetry.StartWith(ctx, append(b.TelemetryLabels(), telemetrymodel.KeyOperation, "Exists")...)
defer func() { telemetry.End(ctx, nil) }()
defer func() { telemetry.End(ctx, err) }()
// Ensure the block is loaded.
if err := b.ensureLoaded(ctx); err != nil {
@@ -1228,14 +1193,14 @@ func (b *block) applyFilters(ctx context.Context, query *QueryOptions) (res filt
return b.mergeFilterResults(labelsResult, vectorsResult), nil
}
func (b *block) applyLabelFilters(ctx context.Context, filters []LabelFilter) filterResult {
func (b *block) applyLabelFilters(ctx context.Context, filters model.LabelFilters) filterResult {
if len(filters) == 0 {
return matchedAllFilterResult
}
var allIDs map[uint64]struct{}
for _, filter := range filters {
ids := b.invertedIndex.Search(ctx, filter.Label, filter.Equal, filter.Value)
ids := b.invertedIndex.Search(ctx, filter)
if len(ids) == 0 {
return matchedNothingFilterResult
}
@@ -1317,7 +1282,7 @@ func (b *block) mergeFilterResults(x, y filterResult) filterResult {
}
func (b *block) fillEmbedding(ctx context.Context, feeds []*model.Feed) ([]*chunk.Feed, error) {
embedded := make([]*chunk.Feed, len(feeds))
embedded := make([]*chunk.Feed, 0, len(feeds))
llm := b.Dependencies().LLMFactory.Get(b.Config().embeddingLLM)
var wg sync.WaitGroup
var mu sync.Mutex
@@ -1336,16 +1301,21 @@ func (b *block) fillEmbedding(ctx context.Context, feeds []*model.Feed) ([]*chun
}
mu.Lock()
embedded[i] = &chunk.Feed{
embedded = append(embedded, &chunk.Feed{
Feed: feed,
Vectors: vectors,
}
})
mu.Unlock()
}(i, feed)
}
wg.Wait()
if len(errs) > 0 {
return nil, errs[0]
switch len(errs) {
case 0:
case len(feeds):
return nil, errs[0] // All failed.
default:
log.Error(ctx, errors.Wrap(errs[0], "fill embedding"), "error_count", len(errs))
}
return embedded, nil

View File

@@ -24,7 +24,7 @@ type Index interface {
index.Codec
// Search returns item IDs matching the given label and value.
Search(ctx context.Context, label string, eq bool, value string) (ids map[uint64]struct{})
Search(ctx context.Context, matcher model.LabelFilter) (ids map[uint64]struct{})
// Add adds item to the index.
// If label or value in labels is empty, it will be ignored.
// If value is too long, it will be ignored,
@@ -88,17 +88,17 @@ type idx struct {
mu sync.RWMutex
}
func (idx *idx) Search(ctx context.Context, label string, eq bool, value string) (ids map[uint64]struct{}) {
func (idx *idx) Search(ctx context.Context, matcher model.LabelFilter) (ids map[uint64]struct{}) {
ctx = telemetry.StartWith(ctx, append(idx.TelemetryLabels(), telemetrymodel.KeyOperation, "Search")...)
defer func() { telemetry.End(ctx, nil) }()
idx.mu.RLock()
defer idx.mu.RUnlock()
if value == "" {
return idx.searchEmptyValue(label, eq)
if matcher.Value == "" {
return idx.searchEmptyValue(matcher.Label, matcher.Equal)
}
return idx.searchNonEmptyValue(label, eq, value)
return idx.searchNonEmptyValue(matcher)
}
func (idx *idx) Add(ctx context.Context, id uint64, labels model.Labels) {
@@ -198,16 +198,16 @@ func (idx *idx) searchEmptyValue(label string, eq bool) map[uint64]struct{} {
// searchNonEmptyValue handles the search logic when the target value is not empty.
// If eq is true, it returns IDs that have the exact label-value pair.
// If eq is false, it returns IDs that *do not* have the exact label-value pair.
func (idx *idx) searchNonEmptyValue(label string, eq bool, value string) map[uint64]struct{} {
func (idx *idx) searchNonEmptyValue(matcher model.LabelFilter) map[uint64]struct{} {
// Get the map of values for the given label.
values, labelExists := idx.m[label]
values, labelExists := idx.m[matcher.Label]
// If equal (eq), find the exact match.
if eq {
if matcher.Equal {
if !labelExists {
return make(map[uint64]struct{}) // Label doesn't exist.
}
ids, valueExists := values[value]
ids, valueExists := values[matcher.Value]
if !valueExists {
return make(map[uint64]struct{}) // Value doesn't exist for this label.
}
@@ -221,7 +221,7 @@ func (idx *idx) searchNonEmptyValue(label string, eq bool, value string) map[uin
resultIDs := maps.Clone(idx.ids)
if labelExists {
// If the specific label-value pair exists, remove its associated IDs.
if matchingIDs, valueExists := values[value]; valueExists {
if matchingIDs, valueExists := values[matcher.Value]; valueExists {
for id := range matchingIDs {
delete(resultIDs, id)
}
@@ -413,8 +413,8 @@ type mockIndex struct {
component.Mock
}
func (m *mockIndex) Search(ctx context.Context, label string, eq bool, value string) (ids map[uint64]struct{}) {
args := m.Called(ctx, label, eq, value)
func (m *mockIndex) Search(ctx context.Context, matcher model.LabelFilter) (ids map[uint64]struct{}) {
args := m.Called(ctx, matcher)
return args.Get(0).(map[uint64]struct{})
}

View File

@@ -118,9 +118,7 @@ func TestSearch(t *testing.T) {
setupLabels map[uint64]model.Labels
}
type whenDetail struct {
searchLabel string
eq bool
searchValue string
matcher model.LabelFilter
}
type thenExpected struct {
want []uint64
@@ -140,9 +138,11 @@ func TestSearch(t *testing.T) {
},
},
WhenDetail: whenDetail{
searchLabel: "category",
searchValue: "tech",
eq: true,
matcher: model.LabelFilter{
Label: "category",
Value: "tech",
Equal: true,
},
},
ThenExpected: thenExpected{
want: []uint64{1, 2},
@@ -159,9 +159,11 @@ func TestSearch(t *testing.T) {
},
},
WhenDetail: whenDetail{
searchLabel: "invalid",
searchValue: "value",
eq: true,
matcher: model.LabelFilter{
Label: "invalid",
Value: "value",
Equal: true,
},
},
ThenExpected: thenExpected{
want: nil,
@@ -178,9 +180,11 @@ func TestSearch(t *testing.T) {
},
},
WhenDetail: whenDetail{
searchLabel: "category",
searchValue: "invalid",
eq: true,
matcher: model.LabelFilter{
Label: "category",
Value: "invalid",
Equal: true,
},
},
ThenExpected: thenExpected{
want: nil,
@@ -200,9 +204,11 @@ func TestSearch(t *testing.T) {
},
},
WhenDetail: whenDetail{
searchLabel: "category",
searchValue: "tech",
eq: false,
matcher: model.LabelFilter{
Label: "category",
Value: "tech",
Equal: false,
},
},
ThenExpected: thenExpected{
want: []uint64{2},
@@ -220,9 +226,11 @@ func TestSearch(t *testing.T) {
},
},
WhenDetail: whenDetail{
searchLabel: "invalid",
searchValue: "value",
eq: false,
matcher: model.LabelFilter{
Label: "invalid",
Value: "value",
Equal: false,
},
},
ThenExpected: thenExpected{
want: []uint64{1, 2},
@@ -240,7 +248,7 @@ func TestSearch(t *testing.T) {
}
// When.
result := idx.Search(context.Background(), tt.WhenDetail.searchLabel, tt.WhenDetail.eq, tt.WhenDetail.searchValue)
result := idx.Search(context.Background(), tt.WhenDetail.matcher)
// Then.
if tt.ThenExpected.want == nil {

View File

@@ -22,6 +22,7 @@ import (
"reflect"
"strconv"
"sync"
"sync/atomic"
"time"
"github.com/benbjohnson/clock"
@@ -98,9 +99,9 @@ func (c *Config) Validate() error {
func (c *Config) From(app *config.App) {
*c = Config{
Dir: app.Storage.Dir,
Retention: app.Storage.Feed.Retention,
BlockDuration: app.Storage.Feed.BlockDuration,
FlushInterval: app.Storage.Feed.FlushInterval,
Retention: time.Duration(app.Storage.Feed.Retention),
BlockDuration: time.Duration(app.Storage.Feed.BlockDuration),
FlushInterval: time.Duration(app.Storage.Feed.FlushInterval),
EmbeddingLLM: app.Storage.Feed.EmbeddingLLM,
}
}
@@ -578,10 +579,14 @@ func (s *storage) blockDependencies() block.Dependencies {
}
func (s *storage) rewrite(ctx context.Context, feeds []*model.Feed) ([]*model.Feed, error) {
rewritten := make([]*model.Feed, 0, len(feeds))
var wg sync.WaitGroup
var errs []error
var mu sync.Mutex
var (
rewritten = make([]*model.Feed, 0, len(feeds))
wg sync.WaitGroup
mu sync.Mutex
errs []error
dropped atomic.Int32
)
for _, item := range feeds { // TODO: Limit the concurrency & goroutine number.
wg.Add(1)
go func(item *model.Feed) {
@@ -596,6 +601,7 @@ func (s *storage) rewrite(ctx context.Context, feeds []*model.Feed) ([]*model.Fe
}
if len(labels) == 0 {
log.Debug(ctx, "drop feed", "id", item.ID)
dropped.Add(1)
return // Drop empty labels.
}
@@ -607,8 +613,13 @@ func (s *storage) rewrite(ctx context.Context, feeds []*model.Feed) ([]*model.Fe
}(item)
}
wg.Wait()
if len(errs) > 0 {
return nil, errs[0]
switch len(errs) {
case 0:
case len(feeds) - int(dropped.Load()):
return nil, errs[0] // All failed.
default:
log.Error(ctx, errors.Wrap(errs[0], "rewrite feeds"), "error_count", len(errs))
}
return rewritten, nil

View File

@@ -1,520 +0,0 @@
// Copyright (C) 2025 wangyusong
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
package feed
// import (
// "context"
// "os"
// "testing"
// "time"
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
//
// "github.com/benbjohnson/clock"
// . "github.com/onsi/gomega"
// "github.com/stretchr/testify/mock"
// "github.com/glidea/zenfeed/pkg/config"
// "github.com/glidea/zenfeed/pkg/storage/feed/block"
// "github.com/glidea/zenfeed/pkg/storage/feed/block/chunk"
// "github.com/glidea/zenfeed/pkg/test"
// timeutil "github.com/glidea/zenfeed/pkg/util/time"
// )
// func TestNew(t *testing.T) {
// RegisterTestingT(t)
// type givenDetail struct {
// now time.Time
// blocksOnDisk []string // Block directory names in format "2006-01-02T15:04:05Z-2006-01-02T15:04:05Z"
// }
// type whenDetail struct {
// app *config.App
// }
// type thenExpected struct {
// storage storage
// storageHotLen int
// storageColdLen int
// blockCalls []func(obj *mock.Mock)
// }
// tests := []test.Case[givenDetail, whenDetail, thenExpected]{
// {
// Scenario: "Create a new storage from an empty directory",
// Given: "just mock a time",
// When: "call New with a config with a data directory",
// Then: "should return a new storage and a hot block created",
// GivenDetail: givenDetail{
// now: timeutil.MustParse("2025-03-03T10:00:00Z"),
// },
// WhenDetail: whenDetail{
// app: &config.App{
// DB: config.DB{
// Dir: "/tmp/TestNew",
// },
// },
// },
// ThenExpected: thenExpected{
// storage: storage{
// config: &Config{
// Dir: "/tmp/TestNew",
// },
// },
// storageHotLen: 1,
// storageColdLen: 0,
// },
// },
// {
// Scenario: "Create a storage from existing directory with blocks",
// Given: "existing blocks on disk",
// GivenDetail: givenDetail{
// now: timeutil.MustParse("2025-03-03T10:00:00Z"),
// blocksOnDisk: []string{
// "2025-03-02T10:00:00Z ~ 2025-03-03T10:00:00Z", // Hot block
// "2025-03-01T10:00:00Z ~ 2025-03-02T10:00:00Z", // Cold block
// "2025-02-28T10:00:00Z ~ 2025-03-01T10:00:00Z", // Cold block
// },
// },
// When: "call New with a config with existing data directory",
// WhenDetail: whenDetail{
// app: &config.App{
// DB: config.DB{
// Dir: "/tmp/TestNew",
// WriteableWindow: 49 * time.Hour,
// },
// },
// },
// Then: "should return a storage with existing blocks loaded",
// ThenExpected: thenExpected{
// storage: storage{
// config: &Config{
// Dir: "/tmp/TestNew",
// Block: BlockConfig{
// WriteableWindow: 49 * time.Hour,
// },
// },
// },
// storageHotLen: 1,
// storageColdLen: 2,
// blockCalls: []func(obj *mock.Mock){
// func(m *mock.Mock) {
// m.On("State").Return(block.StateHot).Once()
// },
// func(m *mock.Mock) {
// m.On("State").Return(block.StateCold).Once()
// },
// func(m *mock.Mock) {
// m.On("State").Return(block.StateCold).Once()
// },
// },
// },
// },
// }
// for _, tt := range tests {
// t.Run(tt.Scenario, func(t *testing.T) {
// // Given.
// c := clock.NewMock()
// c.Set(tt.GivenDetail.now)
// clk = c // Set global clock.
// defer func() { clk = clock.New() }()
// // Create test directories if needed
// if len(tt.GivenDetail.blocksOnDisk) > 0 {
// for _, blockDir := range tt.GivenDetail.blocksOnDisk {
// err := os.MkdirAll(tt.WhenDetail.app.DB.Dir+"/"+blockDir, 0755)
// Expect(err).To(BeNil())
// }
// }
// // When.
// var calls int
// var blockCalls []*mock.Mock
// blockFactory := block.NewFactory(func(obj *mock.Mock) {
// if calls < len(tt.ThenExpected.blockCalls) {
// tt.ThenExpected.blockCalls[calls](obj)
// calls++
// blockCalls = append(blockCalls, obj)
// }
// })
// s, err := new(tt.WhenDetail.app, blockFactory)
// defer os.RemoveAll(tt.WhenDetail.app.DB.Dir)
// // Then.
// Expect(err).To(BeNil())
// Expect(s).NotTo(BeNil())
// storage := s.(*storage)
// Expect(storage.config).To(Equal(tt.ThenExpected.storage.config))
// Expect(len(storage.hot.blocks)).To(Equal(tt.ThenExpected.storageHotLen))
// Expect(len(storage.cold.blocks)).To(Equal(tt.ThenExpected.storageColdLen))
// for _, call := range blockCalls {
// call.AssertExpectations(t)
// }
// })
// }
// }
// func TestAppend(t *testing.T) {
// RegisterTestingT(t)
// type givenDetail struct {
// hotBlocks []func(m *mock.Mock)
// coldBlocks []func(m *mock.Mock)
// }
// type whenDetail struct {
// feeds []*chunk.Feed
// }
// type thenExpected struct {
// err string
// }
// tests := []test.Case[givenDetail, whenDetail, thenExpected]{
// {
// Scenario: "Append feeds to hot block",
// Given: "a storage with one hot block",
// When: "append feeds within hot block time range",
// Then: "should append feeds to hot block successfully",
// GivenDetail: givenDetail{
// hotBlocks: []func(m *mock.Mock){
// func(m *mock.Mock) {
// m.On("Start").Return(timeutil.MustParse("2025-03-02T10:00:00Z")).Twice()
// m.On("End").Return(timeutil.MustParse("2025-03-03T10:00:00Z")).Twice()
// m.On("State").Return(block.StateHot).Twice()
// m.On("Append", mock.Anything, []*chunk.Feed{
// {ID: 1, Time: timeutil.MustParse("2025-03-02T11:00:00Z")},
// {ID: 2, Time: timeutil.MustParse("2025-03-02T12:00:00Z")},
// }).Return(nil)
// },
// },
// },
// WhenDetail: whenDetail{
// feeds: []*chunk.Feed{
// {ID: 1, Time: timeutil.MustParse("2025-03-02T11:00:00Z")},
// {ID: 2, Time: timeutil.MustParse("2025-03-02T12:00:00Z")},
// },
// },
// ThenExpected: thenExpected{
// err: "",
// },
// },
// {
// Scenario: "Append feeds to non-hot block",
// Given: "a storage with hot and cold blocks",
// When: "append feeds with time in cold block range",
// Then: "should return error",
// GivenDetail: givenDetail{
// coldBlocks: []func(m *mock.Mock){
// func(m *mock.Mock) {},
// },
// },
// WhenDetail: whenDetail{
// feeds: []*chunk.Feed{
// {ID: 1, Time: timeutil.MustParse("2025-03-01T11:00:00Z")},
// },
// },
// ThenExpected: thenExpected{
// err: "cannot find hot block",
// },
// },
// }
// for _, tt := range tests {
// t.Run(tt.Scenario, func(t *testing.T) {
// // Given.
// calls := 0
// var blockMocks []*mock.Mock
// blockFactory := block.NewFactory(func(obj *mock.Mock) {
// if calls < len(tt.GivenDetail.hotBlocks) {
// tt.GivenDetail.hotBlocks[calls](obj)
// calls++
// blockMocks = append(blockMocks, obj)
// }
// })
// var hotBlocks blockChain
// for range tt.GivenDetail.hotBlocks {
// block, err := blockFactory.New(nil, nil, nil, nil, nil)
// Expect(err).To(BeNil())
// hotBlocks.add(block)
// }
// blockFactory = block.NewFactory(func(obj *mock.Mock) {
// if calls < len(tt.GivenDetail.coldBlocks) {
// tt.GivenDetail.coldBlocks[calls](obj)
// calls++
// blockMocks = append(blockMocks, obj)
// }
// })
// var coldBlocks blockChain
// for range tt.GivenDetail.coldBlocks {
// block, err := blockFactory.New(nil, nil, nil, nil, nil)
// Expect(err).To(BeNil())
// coldBlocks.add(block)
// }
// s := storage{
// hot: &hotBlocks,
// cold: &coldBlocks,
// }
// // When.
// err := s.Append(context.Background(), tt.WhenDetail.feeds...)
// // Then.
// if tt.ThenExpected.err != "" {
// Expect(err.Error()).To(ContainSubstring(tt.ThenExpected.err))
// } else {
// Expect(err).To(BeNil())
// }
// for _, m := range blockMocks {
// m.AssertExpectations(t)
// }
// })
// }
// }
// func TestQuery(t *testing.T) {
// RegisterTestingT(t)
// type givenDetail struct {
// hotBlocks []func(m *mock.Mock)
// coldBlocks []func(m *mock.Mock)
// }
// type whenDetail struct {
// query block.QueryOptions
// }
// type thenExpected struct {
// feeds []*block.FeedVO
// err string
// }
// tests := []test.Case[givenDetail, whenDetail, thenExpected]{
// {
// Scenario: "Query feeds from hot blocks",
// Given: "a storage with one hot block containing feeds",
// When: "querying with time range within hot block",
// Then: "should return matching feeds from hot block",
// GivenDetail: givenDetail{
// hotBlocks: []func(m *mock.Mock){
// func(m *mock.Mock) {
// m.On("Start").Return(timeutil.MustParse("2025-03-02T10:00:00Z")).Once()
// m.On("End").Return(timeutil.MustParse("2025-03-03T10:00:00Z")).Once()
// m.On("Query", mock.Anything, mock.MatchedBy(func(q block.QueryOptions) bool {
// return q.Start.Equal(timeutil.MustParse("2025-03-02T12:00:00Z")) &&
// q.End.Equal(timeutil.MustParse("2025-03-02T14:00:00Z"))
// })).Return([]*block.FeedVO{
// {ID: 1, Time: timeutil.MustParse("2025-03-02T12:30:00Z")},
// {ID: 2, Time: timeutil.MustParse("2025-03-02T13:00:00Z")},
// }, nil)
// },
// },
// },
// WhenDetail: whenDetail{
// query: block.QueryOptions{
// Start: timeutil.MustParse("2025-03-02T12:00:00Z"),
// End: timeutil.MustParse("2025-03-02T14:00:00Z"),
// Limit: 10,
// },
// },
// ThenExpected: thenExpected{
// feeds: []*block.FeedVO{
// {ID: 2, Time: timeutil.MustParse("2025-03-02T13:00:00Z")},
// {ID: 1, Time: timeutil.MustParse("2025-03-02T12:30:00Z")},
// },
// err: "",
// },
// },
// {
// Scenario: "Query feeds from multiple blocks",
// Given: "a storage with hot and cold blocks containing feeds",
// When: "querying with time range spanning multiple blocks",
// Then: "should return combined and sorted feeds from all matching blocks",
// GivenDetail: givenDetail{
// hotBlocks: []func(m *mock.Mock){
// func(m *mock.Mock) {
// m.On("Start").Return(timeutil.MustParse("2025-03-02T10:00:00Z"))
// m.On("End").Return(timeutil.MustParse("2025-03-03T10:00:00Z"))
// m.On("Query", mock.Anything, mock.MatchedBy(func(q block.QueryOptions) bool {
// return !q.Start.IsZero() && q.End.IsZero()
// })).Return([]*block.FeedVO{
// {ID: 3, Time: timeutil.MustParse("2025-03-02T15:00:00Z")},
// {ID: 4, Time: timeutil.MustParse("2025-03-02T16:00:00Z")},
// }, nil)
// },
// },
// coldBlocks: []func(m *mock.Mock){
// func(m *mock.Mock) {
// m.On("Start").Return(timeutil.MustParse("2025-03-01T10:00:00Z"))
// m.On("End").Return(timeutil.MustParse("2025-03-02T10:00:00Z"))
// m.On("Query", mock.Anything, mock.MatchedBy(func(q block.QueryOptions) bool {
// return !q.Start.IsZero() && q.End.IsZero()
// })).Return([]*block.FeedVO{
// {ID: 1, Time: timeutil.MustParse("2025-03-01T15:00:00Z")},
// {ID: 2, Time: timeutil.MustParse("2025-03-01T16:00:00Z")},
// }, nil)
// },
// },
// },
// WhenDetail: whenDetail{
// query: block.QueryOptions{
// Start: timeutil.MustParse("2025-03-01T12:00:00Z"),
// Limit: 3,
// },
// },
// ThenExpected: thenExpected{
// feeds: []*block.FeedVO{
// {ID: 4, Time: timeutil.MustParse("2025-03-02T16:00:00Z")},
// {ID: 3, Time: timeutil.MustParse("2025-03-02T15:00:00Z")},
// {ID: 2, Time: timeutil.MustParse("2025-03-01T16:00:00Z")},
// },
// err: "",
// },
// },
// }
// for _, tt := range tests {
// t.Run(tt.Scenario, func(t *testing.T) {
// // Given.
// calls := 0
// var blockMocks []*mock.Mock
// blockFactory := block.NewFactory(func(obj *mock.Mock) {
// if calls < len(tt.GivenDetail.hotBlocks) {
// tt.GivenDetail.hotBlocks[calls](obj)
// calls++
// blockMocks = append(blockMocks, obj)
// }
// })
// var hotBlocks blockChain
// for range tt.GivenDetail.hotBlocks {
// block, err := blockFactory.New(nil, nil, nil, nil, nil)
// Expect(err).To(BeNil())
// hotBlocks.add(block)
// }
// blockFactory = block.NewFactory(func(obj *mock.Mock) {
// if calls < len(tt.GivenDetail.hotBlocks)+len(tt.GivenDetail.coldBlocks) {
// tt.GivenDetail.coldBlocks[calls-len(tt.GivenDetail.hotBlocks)](obj)
// calls++
// blockMocks = append(blockMocks, obj)
// }
// })
// var coldBlocks blockChain
// for range tt.GivenDetail.coldBlocks {
// block, err := blockFactory.New(nil, nil, nil, nil, nil)
// Expect(err).To(BeNil())
// coldBlocks.add(block)
// }
// s := storage{
// hot: &hotBlocks,
// cold: &coldBlocks,
// }
// // When.
// feeds, err := s.Query(context.Background(), tt.WhenDetail.query)
// // Then.
// if tt.ThenExpected.err != "" {
// Expect(err).NotTo(BeNil())
// Expect(err.Error()).To(ContainSubstring(tt.ThenExpected.err))
// } else {
// Expect(err).To(BeNil())
// Expect(feeds).To(HaveLen(len(tt.ThenExpected.feeds)))
// // Check feeds match expected
// for i, feed := range feeds {
// Expect(feed.ID).To(Equal(tt.ThenExpected.feeds[i].ID))
// Expect(feed.Time).To(Equal(tt.ThenExpected.feeds[i].Time))
// Expect(feed.Labels).To(Equal(tt.ThenExpected.feeds[i].Labels))
// }
// }
// for _, m := range blockMocks {
// m.AssertExpectations(t)
// }
// })
// }
// }

View File

@@ -32,8 +32,8 @@ import (
// --- Interface code block ---
type Storage interface {
component.Component
Get(ctx context.Context, key string) (string, error)
Set(ctx context.Context, key string, value string, ttl time.Duration) error
Get(ctx context.Context, key []byte) ([]byte, error)
Set(ctx context.Context, key []byte, value []byte, ttl time.Duration) error
}
var ErrNotFound = errors.New("not found")
@@ -137,7 +137,7 @@ func (k *kv) Close() error {
const bucket = "0"
func (k *kv) Get(ctx context.Context, key string) (value string, err error) {
func (k *kv) Get(ctx context.Context, key []byte) (value []byte, err error) {
ctx = telemetry.StartWith(ctx, append(k.TelemetryLabels(), telemetrymodel.KeyOperation, "Get")...)
defer func() {
telemetry.End(ctx, func() error {
@@ -157,22 +157,22 @@ func (k *kv) Get(ctx context.Context, key string) (value string, err error) {
})
switch {
case err == nil:
return string(b), nil
return b, nil
case errors.Is(err, nutsdb.ErrNotFoundKey):
return "", ErrNotFound
return nil, ErrNotFound
case strings.Contains(err.Error(), "key not found"):
return "", ErrNotFound
return nil, ErrNotFound
default:
return "", err
return nil, err
}
}
func (k *kv) Set(ctx context.Context, key string, value string, ttl time.Duration) (err error) {
func (k *kv) Set(ctx context.Context, key []byte, value []byte, ttl time.Duration) (err error) {
ctx = telemetry.StartWith(ctx, append(k.TelemetryLabels(), telemetrymodel.KeyOperation, "Set")...)
defer func() { telemetry.End(ctx, err) }()
return k.db.Update(func(tx *nutsdb.Tx) error {
return tx.Put(bucket, []byte(key), []byte(value), uint32(ttl.Seconds()))
return tx.Put(bucket, key, value, uint32(ttl.Seconds()))
})
}
@@ -180,13 +180,13 @@ type mockKV struct {
component.Mock
}
func (m *mockKV) Get(ctx context.Context, key string) (string, error) {
func (m *mockKV) Get(ctx context.Context, key []byte) ([]byte, error) {
args := m.Called(ctx, key)
return args.String(0), args.Error(1)
return args.Get(0).([]byte), args.Error(1)
}
func (m *mockKV) Set(ctx context.Context, key string, value string, ttl time.Duration) error {
func (m *mockKV) Set(ctx context.Context, key []byte, value []byte, ttl time.Duration) error {
args := m.Called(ctx, key, value, ttl)
return args.Error(0)

View File

@@ -0,0 +1,229 @@
// Copyright (C) 2025 wangyusong
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
package object
import (
"context"
"io"
"net/url"
"strings"
"github.com/minio/minio-go/v7"
"github.com/minio/minio-go/v7/pkg/credentials"
"github.com/pkg/errors"
"github.com/glidea/zenfeed/pkg/component"
"github.com/glidea/zenfeed/pkg/config"
"github.com/glidea/zenfeed/pkg/telemetry"
"github.com/glidea/zenfeed/pkg/telemetry/log"
telemetrymodel "github.com/glidea/zenfeed/pkg/telemetry/model"
)
// --- Interface code block ---

// Storage stores binary objects in an S3-compatible bucket and exposes
// their public URLs.
type Storage interface {
	component.Component
	config.Watcher

	// Put uploads body under key and returns the object's public URL.
	Put(ctx context.Context, key string, body io.Reader, contentType string) (url string, err error)
	// Get returns the public URL for key, or ErrNotFound.
	Get(ctx context.Context, key string) (url string, err error)
}

// ErrNotFound is returned by Get when the object does not exist.
var ErrNotFound = errors.New("not found")
// Config holds the S3-compatible object storage settings.
// The unexported fields are derived by Validate, not user-supplied.
type Config struct {
	Endpoint        string // S3 endpoint host[:port]; scheme prefix is stripped by Validate
	AccessKeyID     string
	SecretAccessKey string
	client          *minio.Client // built by Validate; nil until then

	Bucket    string
	BucketURL string   // public base URL under which objects in Bucket are served
	bucketURL *url.URL // parsed form of BucketURL, set by Validate
}
// Validate checks the object storage configuration and initializes the
// derived fields (minio client, parsed bucket URL).
//
// An entirely empty config is valid: object storage is optional and the
// component returns "not configured" errors at call time instead.
func (c *Config) Validate() error {
	if c.Empty() {
		return nil
	}

	if c.Endpoint == "" {
		return errors.New("endpoint is required")
	}
	c.Endpoint = strings.TrimPrefix(c.Endpoint, "https://") // S3 endpoint should not have https:// prefix.
	c.Endpoint = strings.TrimPrefix(c.Endpoint, "http://")
	if c.AccessKeyID == "" {
		return errors.New("access key id is required")
	}
	if c.SecretAccessKey == "" {
		return errors.New("secret access key is required")
	}
	// NOTE(review): Secure is always true, so an endpoint given with an
	// "http://" prefix is still dialed over TLS after the prefix is
	// stripped — confirm plain-HTTP endpoints are intentionally unsupported.
	client, err := minio.New(c.Endpoint, &minio.Options{
		Creds:  credentials.NewStaticV4(c.AccessKeyID, c.SecretAccessKey, ""),
		Secure: true,
	})
	if err != nil {
		return errors.Wrap(err, "new minio client")
	}
	c.client = client

	if c.Bucket == "" {
		return errors.New("bucket is required")
	}
	if c.BucketURL == "" {
		return errors.New("bucket url is required")
	}
	u, err := url.Parse(c.BucketURL)
	if err != nil {
		return errors.Wrap(err, "parse public url")
	}
	c.bucketURL = u

	return nil
}
// From resets c and copies the object-storage section of the app config
// into it, returning c for chaining.
func (c *Config) From(app *config.App) *Config {
	// Full reset first so derived fields (client, bucketURL) are cleared too.
	*c = Config{}

	src := app.Storage.Object
	c.Endpoint = src.Endpoint
	c.AccessKeyID = src.AccessKeyID
	c.SecretAccessKey = src.SecretAccessKey
	c.Bucket = src.Bucket
	c.BucketURL = src.BucketURL

	return c
}
// Empty reports whether none of the user-facing fields are set, i.e. object
// storage is not configured at all.
func (c *Config) Empty() bool {
	fields := []string{c.Endpoint, c.AccessKeyID, c.SecretAccessKey, c.Bucket, c.BucketURL}
	for _, f := range fields {
		if f != "" {
			return false
		}
	}

	return true
}
// Dependencies lists the external components this package needs (none).
type Dependencies struct{}

// --- Factory code block ---

// Factory creates Storage components from the app config.
type Factory component.Factory[Storage, config.App, Dependencies]
// NewFactory returns the production Storage factory, or a mock-backed
// factory when mock options are supplied (used by tests).
func NewFactory(mockOn ...component.MockOption) Factory {
	if len(mockOn) > 0 {
		return component.FactoryFunc[Storage, config.App, Dependencies](
			func(instance string, config *config.App, dependencies Dependencies) (Storage, error) {
				m := &mockStorage{}
				component.MockOptions(mockOn).Apply(&m.Mock)
				return m, nil
			},
		)
	}

	return component.FactoryFunc[Storage, config.App, Dependencies](new)
}
// new constructs the real S3-backed Storage from the app config.
// An empty object-storage config is accepted; calls then fail with
// "not configured" at use time rather than here.
func new(instance string, app *config.App, dependencies Dependencies) (Storage, error) {
	config := &Config{}
	config.From(app)
	if err := config.Validate(); err != nil {
		return nil, errors.Wrap(err, "validate config")
	}

	return &s3{
		Base: component.New(&component.BaseConfig[Config, Dependencies]{
			Name:         "ObjectStorage",
			Instance:     instance,
			Config:       config,
			Dependencies: dependencies,
		}),
	}, nil
}
// --- Implementation code block ---

// s3 implements Storage against any S3-compatible service via minio-go.
type s3 struct {
	*component.Base[Config, Dependencies]
}
// Put uploads body to the configured bucket under key and returns the
// object's public URL (bucketURL joined with key).
func (s *s3) Put(ctx context.Context, key string, body io.Reader, contentType string) (publicURL string, err error) {
	ctx = telemetry.StartWith(ctx, append(s.TelemetryLabels(), telemetrymodel.KeyOperation, "Put")...)
	defer func() { telemetry.End(ctx, err) }()

	config := s.Config()
	if config.Empty() {
		return "", errors.New("not configured")
	}

	// Size -1 lets minio stream the body without knowing its length upfront.
	if _, err := config.client.PutObject(ctx, config.Bucket, key, body, -1, minio.PutObjectOptions{
		ContentType: contentType,
	}); err != nil {
		return "", errors.Wrap(err, "put object")
	}

	return config.bucketURL.JoinPath(key).String(), nil
}
// Get returns the public URL for key if the object exists in the bucket.
// It returns ErrNotFound when the object does not exist.
func (s *s3) Get(ctx context.Context, key string) (publicURL string, err error) {
	ctx = telemetry.StartWith(ctx, append(s.TelemetryLabels(), telemetrymodel.KeyOperation, "Get")...)
	defer func() { telemetry.End(ctx, err) }()

	config := s.Config()
	if config.Empty() {
		return "", errors.New("not configured")
	}

	// Existence check only; the object body is never downloaded.
	if _, err := config.client.StatObject(ctx, config.Bucket, key, minio.StatObjectOptions{}); err != nil {
		errResponse := minio.ToErrorResponse(err)
		if errResponse.Code == minio.NoSuchKey {
			return "", ErrNotFound
		}
		return "", errors.Wrap(err, "stat object")
	}

	return config.bucketURL.JoinPath(key).String(), nil
}
// Reload implements config.Watcher: it re-validates the object-storage
// section of the new app config and swaps it in via SetConfig.
func (s *s3) Reload(app *config.App) (err error) {
	ctx := telemetry.StartWith(s.Context(), append(s.TelemetryLabels(), telemetrymodel.KeyOperation, "Reload")...)
	defer func() { telemetry.End(ctx, err) }()

	newConfig := &Config{}
	newConfig.From(app)
	if err := newConfig.Validate(); err != nil {
		return errors.Wrap(err, "validate config")
	}

	s.SetConfig(newConfig)
	log.Info(ctx, "object storage reloaded")

	return nil
}
// --- Mock code block ---

// mockStorage is a testify-style test double for Storage.
type mockStorage struct {
	component.Mock
}

// Put records the call and returns the mocked URL/error.
func (m *mockStorage) Put(ctx context.Context, key string, body io.Reader, contentType string) (string, error) {
	args := m.Called(ctx, key, body, contentType)
	return args.String(0), args.Error(1)
}

// Get records the call and returns the mocked URL/error.
func (m *mockStorage) Get(ctx context.Context, key string) (string, error) {
	args := m.Called(ctx, key)
	return args.String(0), args.Error(1)
}

// Reload records the call and returns the mocked error.
func (m *mockStorage) Reload(app *config.App) error {
	args := m.Called(app)
	return args.Error(0)
}

View File

@@ -27,6 +27,8 @@ import (
"github.com/pkg/errors"
slogdedup "github.com/veqryn/slog-dedup"
"github.com/glidea/zenfeed/pkg/model"
)
type Level string
@@ -187,7 +189,8 @@ func getStack(skip, depth int) string {
}
first = false
b.WriteString(frame.Function)
fn := strings.TrimPrefix(frame.Function, model.Module) // no module prefix for zenfeed self.
b.WriteString(fn)
b.WriteByte(':')
b.WriteString(strconv.Itoa(frame.Line))
}

View File

@@ -0,0 +1,137 @@
// Copyright (C) 2025 wangyusong
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
package http
import (
	"context"
	"net"
	"net/http"
	"net/http/pprof"
	"time"

	"github.com/pkg/errors"

	"github.com/glidea/zenfeed/pkg/component"
	"github.com/glidea/zenfeed/pkg/config"
	telemetry "github.com/glidea/zenfeed/pkg/telemetry"
	"github.com/glidea/zenfeed/pkg/telemetry/log"
	"github.com/glidea/zenfeed/pkg/telemetry/metric"
	telemetrymodel "github.com/glidea/zenfeed/pkg/telemetry/model"
)
// --- Interface code block ---

// Server is the telemetry HTTP server component (health, metrics, pprof).
type Server interface {
	component.Component
}

// Config holds the server settings.
type Config struct {
	Address string // listen address; defaulted to ":9090" by Validate
}
// Validate applies the default listen address (":9090") when none is set and
// verifies the address parses as a "host:port" pair.
func (c *Config) Validate() error {
	const defaultAddress = ":9090"
	if c.Address == "" {
		c.Address = defaultAddress
	}

	_, _, err := net.SplitHostPort(c.Address)
	if err != nil {
		return errors.Wrap(err, "invalid address")
	}

	return nil
}
// From copies the telemetry listen address from the app config into c and
// returns c for chaining.
func (c *Config) From(app *config.App) *Config {
	address := app.Telemetry.Address
	c.Address = address

	return c
}
// Dependencies lists the external components this package needs (none).
type Dependencies struct {
}

// --- Factory code block ---

// Factory creates Server components from the app config.
type Factory component.Factory[Server, config.App, Dependencies]
// NewFactory returns the production Server factory, or a mock-backed
// factory when mock options are supplied (used by tests).
func NewFactory(mockOn ...component.MockOption) Factory {
	if len(mockOn) > 0 {
		return component.FactoryFunc[Server, config.App, Dependencies](
			func(instance string, config *config.App, dependencies Dependencies) (Server, error) {
				m := &mockServer{}
				component.MockOptions(mockOn).Apply(&m.Mock)
				return m, nil
			},
		)
	}

	return component.FactoryFunc[Server, config.App, Dependencies](new)
}
// new builds the telemetry HTTP server: /health (liveness), /metrics
// (Prometheus), and pprof endpoints, bound to the configured address.
func new(instance string, app *config.App, dependencies Dependencies) (Server, error) {
	config := &Config{}
	config.From(app)
	if err := config.Validate(); err != nil {
		return nil, errors.Wrap(err, "validate config")
	}

	router := http.NewServeMux()
	// Liveness probe: always 200 once the server is serving.
	router.Handle("/health", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		w.WriteHeader(200)
	}))
	router.Handle("/metrics", metric.Handler())
	// NOTE(review): pprof is mounted under /pprof instead of the conventional
	// /debug/pprof/; pprof.Index generates links assuming the /debug/pprof/
	// prefix, so named profiles (heap, goroutine, ...) may not be reachable
	// from the index page — confirm this layout is intended.
	router.HandleFunc("/pprof", pprof.Index)
	router.HandleFunc("/pprof/cmdline", pprof.Cmdline)
	router.HandleFunc("/pprof/profile", pprof.Profile)
	router.HandleFunc("/pprof/symbol", pprof.Symbol)
	router.HandleFunc("/pprof/trace", pprof.Trace)

	return &server{
		Base: component.New(&component.BaseConfig[Config, Dependencies]{
			Name:         "TelemetryServer",
			Instance:     instance,
			Config:       config,
			Dependencies: dependencies,
		}),
		http: &http.Server{Addr: config.Address, Handler: router},
	}, nil
}
// --- Implementation code block ---

// server implements Server on top of a net/http.Server.
type server struct {
	*component.Base[Config, Dependencies]
	http *http.Server
}
// Run serves HTTP until the component context is canceled, then shuts the
// server down gracefully. It returns the listener error if serving fails.
func (s *server) Run() (err error) {
	ctx := telemetry.StartWith(s.Context(), append(s.TelemetryLabels(), telemetrymodel.KeyOperation, "Run")...)
	defer func() { telemetry.End(ctx, err) }()

	serverErr := make(chan error, 1)
	go func() {
		serverErr <- s.http.ListenAndServe()
	}()
	s.MarkReady()

	select {
	case <-ctx.Done():
		log.Info(ctx, "shutting down")
		// BUG FIX: ctx is already canceled here, so passing it to Shutdown
		// made Shutdown return immediately instead of draining in-flight
		// requests. Use a fresh, bounded context for the graceful drain.
		shutdownCtx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
		defer cancel()

		return s.http.Shutdown(shutdownCtx)
	case err := <-serverErr:
		return errors.Wrap(err, "listen and serve")
	}
}
// mockServer is a test double for Server backed by component.Mock.
type mockServer struct {
	component.Mock
}

View File

@@ -122,6 +122,32 @@ func ReadUint32(r io.Reader) (uint32, error) {
return binary.LittleEndian.Uint32(b), nil
}
// WriteUint16 writes v to w in little-endian byte order using a pooled
// scratch buffer to avoid a per-call allocation.
func WriteUint16(w io.Writer, v uint16) error {
	bp := smallBufPool.Get().(*[]byte)
	defer smallBufPool.Put(bp)
	b := *bp
	// Encode into the pooled buffer, then write only the 2 bytes used.
	binary.LittleEndian.PutUint16(b, v)
	_, err := w.Write(b[:2])
	return err
}
// ReadUint16 reads a little-endian uint16 from r using a pooled scratch
// buffer. It fails if fewer than 2 bytes are available.
func ReadUint16(r io.Reader) (uint16, error) {
	bp := smallBufPool.Get().(*[]byte)
	defer smallBufPool.Put(bp)
	b := (*bp)[:2]
	// Read exactly 2 bytes into the slice.
	if _, err := io.ReadFull(r, b); err != nil {
		return 0, errors.Wrap(err, "read uint16")
	}
	return binary.LittleEndian.Uint16(b), nil
}
// WriteFloat32 writes a float32 using a pooled buffer.
func WriteFloat32(w io.Writer, v float32) error {
return WriteUint32(w, math.Float32bits(v))

180
pkg/util/crawl/crawl.go Normal file
View File

@@ -0,0 +1,180 @@
package crawl
import (
"context"
"fmt"
"io"
"net/http"
"net/url"
"sync"
"github.com/pkg/errors"
"github.com/temoto/robotstxt"
"github.com/glidea/zenfeed/pkg/util/text_convert"
)
// Crawler fetches a web page and returns its content converted to Markdown.
type Crawler interface {
	// Markdown downloads the page at u and returns it as Markdown bytes.
	Markdown(ctx context.Context, u string) ([]byte, error)
}
// local is a Crawler that fetches pages directly with its own HTTP client,
// honoring each host's robots.txt.
type local struct {
	hc *http.Client
	// robotsDataCache maps host (string) -> *robotstxt.RobotsData.
	robotsDataCache sync.Map
}

// NewLocal returns a Crawler that performs direct HTTP fetches.
// NOTE(review): the client has no Timeout set; a stalled server can block
// callers until ctx is canceled — confirm whether a timeout is wanted.
func NewLocal() Crawler {
	return &local{
		hc: &http.Client{},
	}
}
// Markdown fetches the page at u (after a robots.txt check) and converts the
// HTML body to Markdown.
func (c *local) Markdown(ctx context.Context, u string) ([]byte, error) {
	// Check if the page is allowed.
	if err := c.checkAllowed(ctx, u); err != nil {
		return nil, errors.Wrapf(err, "check robots.txt for %s", u)
	}

	// Prepare the request.
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil)
	if err != nil {
		return nil, errors.Wrapf(err, "create request for %s", u)
	}
	req.Header.Set("User-Agent", userAgent)

	// Send the request.
	resp, err := c.hc.Do(req)
	if err != nil {
		return nil, errors.Wrapf(err, "fetch %s", u)
	}
	defer func() { _ = resp.Body.Close() }()

	// Parse the response.
	if resp.StatusCode != http.StatusOK {
		return nil, errors.Errorf("received non-200 status code %d from %s", resp.StatusCode, u)
	}
	// NOTE(review): the whole body is buffered in memory with no size cap;
	// fine for normal pages, unbounded for hostile/huge responses.
	bodyBytes, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, errors.Wrapf(err, "read body from %s", u)
	}

	// Convert the body to markdown.
	mdBytes, err := textconvert.HTMLToMarkdown(bodyBytes)
	if err != nil {
		return nil, errors.Wrap(err, "convert html to markdown")
	}

	return mdBytes, nil
}
// userAgent identifies this crawler in HTTP requests and robots.txt checks.
const userAgent = "ZenFeed"

// checkAllowed returns an error when robots.txt for u's host disallows
// fetching u's path for our user agent.
func (c *local) checkAllowed(ctx context.Context, u string) error {
	parsedURL, err := url.Parse(u)
	if err != nil {
		return errors.Wrapf(err, "parse url %s", u)
	}
	d, err := c.getRobotsData(ctx, parsedURL.Host)
	if err != nil {
		return errors.Wrapf(err, "check robots.txt for %s", parsedURL.Host)
	}
	// NOTE(review): only the path is tested; query strings are ignored —
	// confirm that matches the intended robots semantics.
	if !d.TestAgent(parsedURL.Path, userAgent) {
		return errors.Errorf("disallowed by robots.txt for %s", u)
	}
	return nil
}
// getRobotsData fetches and parses robots.txt for a given host.
//
// Results are cached per host in c.robotsDataCache; a 404 is cached as an
// empty (allow-all) ruleset. Errors are not cached, so failing hosts are
// re-fetched on the next call.
// NOTE(review): concurrent callers for the same host may fetch robots.txt
// more than once (benign last-write-wins race on the cache).
func (c *local) getRobotsData(ctx context.Context, host string) (*robotstxt.RobotsData, error) {
	// Check the cache.
	if data, found := c.robotsDataCache.Load(host); found {
		return data.(*robotstxt.RobotsData), nil
	}

	// Prepare the request.
	// NOTE(review): robots.txt is always fetched over https regardless of the
	// target page's scheme — confirm http-only sites are not expected.
	robotsURL := fmt.Sprintf("https://%s/robots.txt", host)
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, robotsURL, nil)
	if err != nil {
		return nil, errors.Wrapf(err, "create request for %s", robotsURL)
	}
	req.Header.Set("User-Agent", userAgent)

	// Send the request.
	resp, err := c.hc.Do(req)
	if err != nil {
		return nil, errors.Wrapf(err, "fetch %s", robotsURL)
	}
	defer func() { _ = resp.Body.Close() }()

	// Parse the response.
	switch resp.StatusCode {
	case http.StatusOK:
		data, err := robotstxt.FromResponse(resp)
		if err != nil {
			return nil, errors.Wrapf(err, "parse robots.txt from %s", robotsURL)
		}
		c.robotsDataCache.Store(host, data)
		return data, nil
	case http.StatusNotFound:
		// No robots.txt: treat as allow-all and cache the empty ruleset.
		data := &robotstxt.RobotsData{}
		c.robotsDataCache.Store(host, data)
		return data, nil
	case http.StatusUnauthorized, http.StatusForbidden:
		return nil, errors.Errorf("access to %s denied (status %d)", robotsURL, resp.StatusCode)
	default:
		return nil, errors.Errorf("unexpected status %d fetching %s", resp.StatusCode, robotsURL)
	}
}
// jina is a Crawler that delegates fetching and Markdown conversion to the
// Jina Reader service (https://r.jina.ai).
type jina struct {
	hc    *http.Client
	token string
}

// NewJina returns a Crawler backed by the Jina Reader API.
func NewJina(token string) Crawler {
	return &jina{
		hc: &http.Client{},
		// An empty token still works, but with a lower rate limit.
		// See https://jina.ai/api-dashboard/rate-limit.
		token: token,
	}
}
// Markdown fetches u through the Jina Reader proxy, which returns the page
// already converted to Markdown.
func (c *jina) Markdown(ctx context.Context, u string) ([]byte, error) {
	proxyURL := "https://r.jina.ai/" + u
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, proxyURL, nil)
	if err != nil {
		return nil, errors.Wrapf(err, "create request for %s", u)
	}
	// Ask Jina to render with a browser engine and to apply robots.txt rules
	// under our user agent.
	req.Header.Set("X-Engine", "browser")
	req.Header.Set("X-Robots-Txt", userAgent)
	if c.token != "" {
		req.Header.Set("Authorization", "Bearer "+c.token)
	}

	resp, err := c.hc.Do(req)
	if err != nil {
		return nil, errors.Wrapf(err, "fetch %s", proxyURL)
	}
	defer func() { _ = resp.Body.Close() }()

	if resp.StatusCode != http.StatusOK {
		return nil, errors.Errorf("received non-200 status code %d from %s", resp.StatusCode, proxyURL)
	}
	mdBytes, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, errors.Wrapf(err, "read body from %s", proxyURL)
	}

	return mdBytes, nil
}

View File

@@ -13,39 +13,19 @@
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
package rpc
package jsonrpc
import (
"context"
"encoding/json"
"errors"
"net/http"
"github.com/glidea/zenfeed/pkg/api"
)
type Handler[Request any, Response any] func(ctx context.Context, req *Request) (*Response, error)
var (
ErrBadRequest = func(err error) Error { return newError(http.StatusBadRequest, err) }
ErrNotFound = func(err error) Error { return newError(http.StatusNotFound, err) }
ErrInternal = func(err error) Error { return newError(http.StatusInternalServerError, err) }
)
type Error struct {
Code int `json:"code"`
Message string `json:"message"`
}
func (e Error) Error() string {
return e.Message
}
func newError(code int, err error) Error {
return Error{
Code: code,
Message: err.Error(),
}
}
func API[Request any, Response any](handler Handler[Request, Response]) http.Handler {
return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
allowCORS(w)
@@ -65,11 +45,11 @@ func API[Request any, Response any](handler Handler[Request, Response]) http.Han
resp, err := handler(r.Context(), &req)
if err != nil {
var rpcErr Error
if errors.As(err, &rpcErr) {
var apiErr api.Error
if errors.As(err, &apiErr) {
w.Header().Set("Content-Type", "application/json")
w.WriteHeader(rpcErr.Code)
_ = json.NewEncoder(w).Encode(rpcErr)
w.WriteHeader(apiErr.Code)
_ = json.NewEncoder(w).Encode(apiErr)
return
}

View File

@@ -13,7 +13,7 @@
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
package rpc
package jsonrpc
import (
"bytes"
@@ -27,6 +27,7 @@ import (
. "github.com/onsi/gomega"
"github.com/glidea/zenfeed/pkg/api"
"github.com/glidea/zenfeed/pkg/test"
)
@@ -58,15 +59,15 @@ func TestAPI(t *testing.T) {
}
badRequestHandler := func(ctx context.Context, req *TestRequest) (*TestResponse, error) {
return nil, ErrBadRequest(errors.New("invalid request"))
return nil, api.ErrBadRequest(errors.New("invalid request"))
}
notFoundHandler := func(ctx context.Context, req *TestRequest) (*TestResponse, error) {
return nil, ErrNotFound(errors.New("resource not found"))
return nil, api.ErrNotFound(errors.New("resource not found"))
}
internalErrorHandler := func(ctx context.Context, req *TestRequest) (*TestResponse, error) {
return nil, ErrInternal(errors.New("server error"))
return nil, api.ErrInternal(errors.New("server error"))
}
genericErrorHandler := func(ctx context.Context, req *TestRequest) (*TestResponse, error) {

View File

@@ -17,11 +17,13 @@ package time
import (
"context"
"encoding/json"
"math/rand"
"time"
_ "time/tzdata"
"github.com/pkg/errors"
"gopkg.in/yaml.v3"
runtimeutil "github.com/glidea/zenfeed/pkg/util/runtime"
)
@@ -84,3 +86,60 @@ func Tick(ctx context.Context, d time.Duration, f func() error) error {
func Random(max time.Duration) time.Duration {
return time.Duration(rand.Int63n(int64(max)))
}
type Duration time.Duration
func (d Duration) String() string {
return time.Duration(d).String()
}
func (d Duration) MarshalJSON() ([]byte, error) {
return json.Marshal(d.String())
}
// UnmarshalJSON accepts either a JSON number (interpreted as nanoseconds)
// or a JSON string in Go duration syntax (e.g. "1h30m").
func (d *Duration) UnmarshalJSON(b []byte) error {
	var v any
	if err := json.Unmarshal(b, &v); err != nil {
		return err
	}
	switch tv := v.(type) {
	case float64:
		// JSON numbers decode as float64; treat the value as nanoseconds.
		*d = Duration(time.Duration(tv))
		return nil
	case string:
		parsed, err := time.ParseDuration(tv)
		if err != nil {
			return err
		}
		*d = Duration(parsed)
		return nil
	default:
		return errors.Errorf("invalid duration: %v", tv)
	}
}
// MarshalYAML encodes the duration as a duration string, e.g. "1h30m0s".
func (d Duration) MarshalYAML() (interface{}, error) {
	return d.String(), nil
}

// UnmarshalYAML decodes a scalar YAML node in Go duration syntax.
// Unlike UnmarshalJSON, a bare number is not accepted here: the scalar must
// parse via time.ParseDuration.
func (d *Duration) UnmarshalYAML(value *yaml.Node) error {
	if value.Kind != yaml.ScalarNode {
		return errors.Errorf("invalid duration: expected a scalar node, got %v", value.Kind)
	}
	s := value.Value
	parsed, err := time.ParseDuration(s)
	if err != nil {
		return errors.Errorf("failed to parse duration string '%s' from YAML: %s", s, err.Error())
	}
	*d = Duration(parsed)
	return nil
}

100
pkg/util/wav/wav.go Normal file
View File

@@ -0,0 +1,100 @@
// Copyright (C) 2025 wangyusong
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
package wav
import (
"io"
"github.com/pkg/errors"
binaryutil "github.com/glidea/zenfeed/pkg/util/binary"
)
// Header contains the WAV header information.
type Header struct {
	SampleRate  uint32 // samples per second, e.g. 44100
	BitDepth    uint16 // bits per sample, e.g. 16
	NumChannels uint16 // 1 = mono, 2 = stereo
}
// WriteHeader writes the WAV header to a writer.
// pcmDataSize is the size of the raw PCM data in bytes; the caller writes
// the PCM samples themselves immediately after this header.
func WriteHeader(w io.Writer, h *Header, pcmDataSize uint32) error {
	// RIFF Header.
	if err := writeRIFFHeader(w, pcmDataSize); err != nil {
		return errors.Wrap(err, "write RIFF header")
	}

	// fmt chunk.
	if err := writeFMTChunk(w, h); err != nil {
		return errors.Wrap(err, "write fmt chunk")
	}

	// data chunk: marker followed by the PCM payload length.
	if _, err := w.Write([]byte("data")); err != nil {
		return errors.Wrap(err, "write data chunk marker")
	}
	if err := binaryutil.WriteUint32(w, pcmDataSize); err != nil {
		return errors.Wrap(err, "write pcm data size")
	}

	return nil
}
// writeRIFFHeader emits the 12-byte RIFF chunk descriptor: the "RIFF"
// tag, the chunk size (remaining header bytes plus PCM payload), and
// the "WAVE" format tag.
func writeRIFFHeader(w io.Writer, pcmDataSize uint32) error {
	// 36 = header bytes that follow this size field (fmt + data chunks).
	chunkSize := 36 + pcmDataSize

	if _, err := w.Write([]byte("RIFF")); err != nil {
		return errors.Wrap(err, "write RIFF")
	}
	if err := binaryutil.WriteUint32(w, chunkSize); err != nil {
		return errors.Wrap(err, "write file size")
	}
	if _, err := w.Write([]byte("WAVE")); err != nil {
		return errors.Wrap(err, "write WAVE")
	}

	return nil
}
// writeFMTChunk emits the 24-byte "fmt " sub-chunk describing an
// uncompressed PCM stream (audio format tag 1) with the parameters
// from h.
func writeFMTChunk(w io.Writer, h *Header) error {
	if _, err := w.Write([]byte("fmt ")); err != nil {
		return errors.Wrap(err, "write fmt")
	}
	// Sub-chunk body is always 16 bytes for plain PCM.
	if err := binaryutil.WriteUint32(w, 16); err != nil {
		return errors.Wrap(err, "write pcm chunk size")
	}
	// Audio format tag 1 = uncompressed PCM.
	if err := binaryutil.WriteUint16(w, 1); err != nil {
		return errors.Wrap(err, "write pcm format")
	}
	if err := binaryutil.WriteUint16(w, h.NumChannels); err != nil {
		return errors.Wrap(err, "write num channels")
	}
	if err := binaryutil.WriteUint32(w, h.SampleRate); err != nil {
		return errors.Wrap(err, "write sample rate")
	}

	// Bytes consumed per second of audio.
	byteRate := h.SampleRate * uint32(h.NumChannels) * uint32(h.BitDepth) / 8
	if err := binaryutil.WriteUint32(w, byteRate); err != nil {
		return errors.Wrap(err, "write byte rate")
	}

	// Bytes per sample frame across all channels (block align).
	frameSize := h.NumChannels * h.BitDepth / 8
	if err := binaryutil.WriteUint16(w, frameSize); err != nil {
		return errors.Wrap(err, "write block align")
	}

	if err := binaryutil.WriteUint16(w, h.BitDepth); err != nil {
		return errors.Wrap(err, "write bit depth")
	}

	return nil
}

161
pkg/util/wav/wav_test.go Normal file
View File

@@ -0,0 +1,161 @@
// Copyright (C) 2025 wangyusong
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
package wav
import (
"bytes"
"testing"
. "github.com/onsi/gomega"
"github.com/glidea/zenfeed/pkg/test"
)
// TestWriteHeader verifies that WriteHeader produces the exact 44-byte
// RIFF/WAVE header (golden bytes, little-endian) for representative
// audio configurations: stereo CD quality, mono speech, and 8-bit mono
// with an empty payload.
func TestWriteHeader(t *testing.T) {
	RegisterTestingT(t)

	type givenDetail struct{}
	type whenDetail struct {
		header      *Header // Stream parameters passed to WriteHeader.
		pcmDataSize uint32  // Declared size of the PCM payload that would follow.
	}
	type thenExpected struct {
		expectedBytes []byte // Exact 44-byte header the writer must emit.
		expectError   bool   // Whether WriteHeader should fail.
	}

	tests := []test.Case[givenDetail, whenDetail, thenExpected]{
		{
			Scenario:    "Standard CD quality audio",
			Given:       "a header for CD quality audio and a non-zero data size",
			When:        "writing the header",
			Then:        "should produce a valid 44-byte WAV header and no error",
			GivenDetail: givenDetail{},
			WhenDetail: whenDetail{
				header: &Header{
					SampleRate:  44100,
					BitDepth:    16,
					NumChannels: 2,
				},
				pcmDataSize: 176400,
			},
			ThenExpected: thenExpected{
				expectedBytes: []byte{
					'R', 'I', 'F', 'F',
					0x34, 0xB1, 0x02, 0x00, // ChunkSize = 36 + 176400 = 176436
					'W', 'A', 'V', 'E',
					'f', 'm', 't', ' ',
					0x10, 0x00, 0x00, 0x00, // Subchunk1Size = 16
					0x01, 0x00, // AudioFormat = 1 (PCM)
					0x02, 0x00, // NumChannels = 2
					0x44, 0xAC, 0x00, 0x00, // SampleRate = 44100
					0x10, 0xB1, 0x02, 0x00, // ByteRate = 176400
					0x04, 0x00, // BlockAlign = 4
					0x10, 0x00, // BitsPerSample = 16
					'd', 'a', 't', 'a',
					0x10, 0xB1, 0x02, 0x00, // Subchunk2Size = 176400
				},
				expectError: false,
			},
		},
		{
			Scenario:    "Mono audio for speech",
			Given:       "a header for mono speech audio and a non-zero data size",
			When:        "writing the header",
			Then:        "should produce a valid 44-byte WAV header and no error",
			GivenDetail: givenDetail{},
			WhenDetail: whenDetail{
				header: &Header{
					SampleRate:  16000,
					BitDepth:    16,
					NumChannels: 1,
				},
				pcmDataSize: 32000,
			},
			ThenExpected: thenExpected{
				expectedBytes: []byte{
					'R', 'I', 'F', 'F',
					0x24, 0x7D, 0x00, 0x00, // ChunkSize = 36 + 32000 = 32036
					'W', 'A', 'V', 'E',
					'f', 'm', 't', ' ',
					0x10, 0x00, 0x00, 0x00, // Subchunk1Size = 16
					0x01, 0x00, // AudioFormat = 1
					0x01, 0x00, // NumChannels = 1
					0x80, 0x3E, 0x00, 0x00, // SampleRate = 16000
					0x00, 0x7D, 0x00, 0x00, // ByteRate = 32000
					0x02, 0x00, // BlockAlign = 2
					0x10, 0x00, // BitsPerSample = 16
					'd', 'a', 't', 'a',
					0x00, 0x7D, 0x00, 0x00, // Subchunk2Size = 32000
				},
				expectError: false,
			},
		},
		{
			Scenario:    "8-bit mono audio with zero data size",
			Given:       "a header for 8-bit mono audio and a zero data size",
			When:        "writing the header for an empty file",
			Then:        "should produce a valid 44-byte WAV header with data size 0",
			GivenDetail: givenDetail{},
			WhenDetail: whenDetail{
				header: &Header{
					SampleRate:  8000,
					BitDepth:    8,
					NumChannels: 1,
				},
				pcmDataSize: 0,
			},
			ThenExpected: thenExpected{
				expectedBytes: []byte{
					'R', 'I', 'F', 'F',
					0x24, 0x00, 0x00, 0x00, // ChunkSize = 36 + 0 = 36
					'W', 'A', 'V', 'E',
					'f', 'm', 't', ' ',
					0x10, 0x00, 0x00, 0x00, // Subchunk1Size = 16
					0x01, 0x00, // AudioFormat = 1
					0x01, 0x00, // NumChannels = 1
					0x40, 0x1F, 0x00, 0x00, // SampleRate = 8000
					0x40, 0x1F, 0x00, 0x00, // ByteRate = 8000
					0x01, 0x00, // BlockAlign = 1
					0x08, 0x00, // BitsPerSample = 8
					'd', 'a', 't', 'a',
					0x00, 0x00, 0x00, 0x00, // Subchunk2Size = 0
				},
				expectError: false,
			},
		},
	}

	for _, tt := range tests {
		t.Run(tt.Scenario, func(t *testing.T) {
			// Given.
			var buf bytes.Buffer

			// When.
			err := WriteHeader(&buf, tt.WhenDetail.header, tt.WhenDetail.pcmDataSize)

			// Then.
			if tt.ThenExpected.expectError {
				Expect(err).To(HaveOccurred())
			} else {
				Expect(err).NotTo(HaveOccurred())
				// Golden-byte comparison: the whole header must match exactly.
				Expect(buf.Bytes()).To(Equal(tt.ThenExpected.expectedBytes))
			}
		})
	}
}