Files
zenfeed/pkg/rewrite/rewrite.go
2025-04-28 23:29:34 +08:00

574 lines
22 KiB
Go

// Copyright (C) 2025 wangyusong
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
package rewrite
import (
"context"
"html/template"
"regexp"
"unicode/utf8"
"unsafe"
"github.com/pkg/errors"
"k8s.io/utils/ptr"
"github.com/glidea/zenfeed/pkg/component"
"github.com/glidea/zenfeed/pkg/config"
"github.com/glidea/zenfeed/pkg/llm"
"github.com/glidea/zenfeed/pkg/model"
"github.com/glidea/zenfeed/pkg/telemetry"
telemetrymodel "github.com/glidea/zenfeed/pkg/telemetry/model"
"github.com/glidea/zenfeed/pkg/util/buffer"
)
// --- Interface code block ---
// Rewriter applies the configured rewrite rules to a set of labels.
// It embeds component.Component and config.Watcher alongside the Labels method.
type Rewriter interface {
component.Component
config.Watcher
// Labels applies rewrite rules to the given labels and returns the modified labels.
// Note: this method modifies the input labels in place.
// If a matching rule's action is ActionDropFeed, it returns nil labels (with a nil error)
// to indicate the item should be dropped.
Labels(ctx context.Context, labels model.Labels) (model.Labels, error)
}
// Config is the list of rewrite rules; Labels evaluates them in slice order.
type Config []Rule
// Validate validates every rule in order, letting each rule fill in its own
// defaults in place. It stops at, and returns, the first validation failure.
func (c *Config) Validate() error {
	rules := *c
	for idx := range rules {
		// Take the element's address so defaults written by Rule.Validate stick.
		rule := &rules[idx]

		err := rule.Validate()
		if err == nil {
			continue
		}

		return errors.Wrapf(err, "validate and adjust rewrite config")
	}

	return nil
}
// From appends one Rule to the config for every rewrite rule found in
// app.Storage.Feed.Rewrites, preserving their order. Existing entries are kept.
func (c *Config) From(app *config.App) {
	for _, src := range app.Storage.Feed.Rewrites {
		rule := Rule{}
		rule.From(&src)

		*c = append(*c, rule)
	}
}
// Dependencies lists the external components the rewriter needs.
type Dependencies struct {
// LLMFactory is used to look up an LLM by name when a rule has a text transform.
LLMFactory llm.Factory
}
// Rule describes a single rewrite rule: which label to read, how to optionally
// transform its text, what the text must match, and what to do on a match.
type Rule struct {
// SourceLabel specifies which label's value to use as source text.
// Default is model.LabelContent.
SourceLabel string
// SkipTooShortThreshold is the threshold of the source text length, counted in runes.
// If the source text is shorter than this threshold, the rule is skipped.
// Validate defaults it to 300 when it is nil.
SkipTooShortThreshold *int
// Transform used to transform the source text.
// If not set, the original source text is used as is.
Transform *Transform
// Match is a regular expression matched against the text after transform.
// If not set, match all.
Match string
// matchRE is Match compiled by Validate.
matchRE *regexp.Regexp
// Action determines what to do if the text matches.
Action Action
// Label is the label to create or update.
Label string
}
// Validate checks the rule and fills in defaults in place:
//   - SourceLabel defaults to model.LabelContent.
//   - SkipTooShortThreshold defaults to 300.
//   - Match defaults to ".*" and is compiled into matchRE.
//   - Action defaults to ActionCreateOrUpdateLabel.
//
// When Transform is set, its ToText section and prompt are required; the prompt
// is rendered once here and cached for later use.
//
// It returns an error if the transform is incomplete, the prompt template or
// the match expression is invalid, the action is unknown, or the (possibly
// defaulted) create-or-update action has no Label.
func (r *Rule) Validate() error { //nolint:cyclop
	// Source label.
	if r.SourceLabel == "" {
		r.SourceLabel = model.LabelContent
	}
	if r.SkipTooShortThreshold == nil {
		r.SkipTooShortThreshold = ptr.To(300)
	}

	// Transform.
	if r.Transform != nil {
		toText := r.Transform.ToText
		// Rule.From creates a Transform even when the config has no to-text
		// section, so guard against a nil ToText instead of dereferencing it.
		if toText == nil {
			return errors.New("to text is required for transform")
		}
		if toText.Prompt == "" {
			return errors.New("to text prompt is required")
		}
		rendered, err := renderPrompt(toText.Prompt)
		if err != nil {
			return err
		}
		toText.promptRendered = rendered
	}

	// Match.
	if r.Match == "" {
		r.Match = ".*"
	}
	re, err := regexp.Compile(r.Match)
	if err != nil {
		return errors.Wrapf(err, "compile match regex %s", r.Match)
	}
	r.matchRE = re

	// Action.
	// Apply the default first so that a defaulted action goes through the same
	// Label check as an explicitly configured one.
	if r.Action == "" {
		r.Action = ActionCreateOrUpdateLabel
	}
	switch r.Action {
	case ActionCreateOrUpdateLabel:
		if r.Label == "" {
			return errors.New("label is required for create or update label action")
		}
	case ActionDropFeed:
	default:
		return errors.Errorf("invalid action: %s", r.Action)
	}

	return nil
}

// renderPrompt parses prompt as a template and executes it with the built-in
// prompt templates as data, returning the rendered text.
func renderPrompt(prompt string) (string, error) {
	tmpl, err := template.New("").Parse(prompt)
	if err != nil {
		return "", errors.Wrapf(err, "parse prompt template %s", prompt)
	}

	// html/template HTML-escapes plain string values, which would turn the
	// quotes and HTML examples inside the built-in prompts into entities
	// (&#34;, &lt;, ...). The built-ins are trusted constants, so pass them as
	// template.HTML to have them inserted verbatim.
	data := make(map[string]template.HTML, len(promptTemplates))
	for name, text := range promptTemplates {
		data[name] = template.HTML(text) //nolint:gosec // trusted package-level constants.
	}

	buf := buffer.Get()
	defer buffer.Put(buf)
	if err := tmpl.Execute(buf, data); err != nil {
		return "", errors.Wrapf(err, "execute prompt template %s", prompt)
	}

	return buf.String(), nil
}
// From copies the fields of the given config rewrite rule into r.
//
// Match is taken from c.Match and falls back to c.MatchRE when empty.
// Transform is only assigned when the config rule has one; its ToText is left
// nil when the config transform has no ToText section.
func (r *Rule) From(c *config.RewriteRule) {
	r.SourceLabel, r.SkipTooShortThreshold = c.SourceLabel, c.SkipTooShortThreshold
	r.Action, r.Label = Action(c.Action), c.Label

	r.Match = c.Match
	if c.Match == "" {
		r.Match = c.MatchRE
	}

	if c.Transform == nil {
		return
	}

	transform := &Transform{}
	if src := c.Transform.ToText; src != nil {
		transform.ToText = &ToText{LLM: src.LLM, Prompt: src.Prompt}
	}
	r.Transform = transform
}
// Transform describes how a rule's source text is transformed before matching.
type Transform struct {
// ToText configures an LLM-based text-to-text transform.
ToText *ToText
}
// ToText configures a transform that sends the source text to an LLM together
// with a prompt and uses the completion as the transformed text.
type ToText struct {
// LLM is the name of the LLM to use.
LLM string
// Prompt is the prompt for LLM completion.
// The source text will automatically be injected into the prompt.
Prompt string
// promptRendered is Prompt after template execution; it is set by Rule.Validate.
promptRendered string
}
// Action is what a rule does when its (optionally transformed) text matches.
type Action string
const (
// ActionDropFeed makes Labels return nil labels so the item is dropped.
ActionDropFeed Action = "drop_feed"
// ActionCreateOrUpdateLabel stores the text under the rule's Label.
ActionCreateOrUpdateLabel Action = "create_or_update_label"
)
// promptTemplates holds the built-in prompt snippets, keyed by name.
// The map is used as the data when a rule's prompt template is executed during
// validation, so a prompt can pull a snippet in by its key.
var promptTemplates = map[string]string{
"category": `
Analyze the content and categorize it into exactly one of these categories:
Technology, Development, Entertainment, Finance, Health, Politics, Other
Classification requirements:
- Choose the SINGLE most appropriate category based on:
* Primary topic and main focus of the content
* Key terminology and concepts used
* Target audience and purpose
* Technical depth and complexity level
- For content that could fit multiple categories:
* Identify the dominant theme
* Consider the most specific applicable category
* Use the primary intended purpose
- If content appears ambiguous:
* Focus on the most prominent aspects
* Consider the practical application
* Choose the category that best serves user needs
Output format:
Return ONLY the category name, no other text or explanation.
Must be one of the provided categories exactly as written.
`,
"tags": `
Analyze the content and add appropriate tags based on:
- Main topics and themes
- Key concepts and terminology
- Target audience and purpose
- Technical depth and domain
- 2-4 tags are enough
Output format:
Return a list of tags, separated by commas, no other text or explanation.
e.g. "AI, Technology, Innovation, Future"
`,
"score": `
Please give a score between 0 and 10 based on the following content.
Evaluate the content comprehensively considering clarity, accuracy, depth, logical structure, language expression, and completeness.
Note: If the content is an article or a text intended to be detailed, the length is an important factor. Generally, content under 300 words may receive a lower score due to lack of substance, unless its type (such as poetry or summary) is inherently suitable for brevity.
Output format:
Return the score (0-10), no other text or explanation.
E.g. "8", "5", "3", etc.
`,
"comment_confucius": `
Please act as Confucius and write a 100-word comment on the article.
Content needs to be in line with the Chinese mainland's regulations.
Output format:
Return the comment only, no other text or explanation.
Reply short and concise, 100 words is enough.
`,
"summary": `
Summarize the article in 100-200 words.
`,
"summary_html_snippet": `
# Task: Create Visually Appealing Information Summary Emails
You are a professional content designer. Please convert the provided articles into **visually modern HTML email segments**, focusing on display effects in modern clients like Gmail and QQ Mail.
## Key Requirements:
1. **Output Format**:
- Only output HTML code snippets, **no need for complete HTML document structure**
- Only generate HTML code for a single article, so users can combine multiple pieces into a complete email
- No explanations, additional comments, or markups
- **No need to add titles and sources**, users will inject them automatically
- No use html backticks, output raw html code directly
- Output directly, no explanation, no comments, no markups
2. **Content Processing**:
- **Don't directly copy the original text**, but extract key information and core insights from each article
- **Each article summary should be 100-200 words**, don't force word count, adjust the word count based on the actual length of the article
- Summarize points in relaxed, natural language, as if chatting with friends, while maintaining depth
- Maintain the original language of the article (e.g., Chinese summary for Chinese articles)
3. **Visual Design**:
- Design should be aesthetically pleasing with coordinated colors
- Use sufficient whitespace and contrast
- Maintain a consistent visual style across all articles
- **Must use multiple visual elements** (charts, cards, quote blocks, etc.), avoid pure text presentation
- Each article should use at least 2-3 different visual elements to make content more intuitive and readable
4. **Highlight Techniques**:
A. **Beautiful Quote Blocks** (for highlighting important viewpoints):
<div style="margin:20px 0; padding:20px; background:linear-gradient(to right, #f8f9fa, #ffffff); border-left:5px solid #4285f4; border-radius:5px; box-shadow:0 2px 8px rgba(0,0,0,0.05);">
<p style="margin:0; font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:16px; line-height:1.6; color:#333; font-weight:500;">
Here is the key viewpoint or finding that needs to be highlighted.
</p>
</div>
B. **Information Cards** (for highlighting key data):
<div style="display:inline-block; margin:10px 10px 10px 0; padding:15px 20px; background-color:#ffffff; border-radius:8px; box-shadow:0 3px 10px rgba(0,0,0,0.08); min-width:120px; text-align:center;">
<p style="margin:0 0 5px 0; font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:14px; color:#666;">Metric Name</p>
<p style="margin:0; font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:24px; font-weight:600; color:#1a73e8;">75%</p>
</div>
C. **Key Points List** (for highlighting multiple points):
<ul style="margin:20px 0; padding-left:0; list-style-type:none;">
<li style="position:relative; margin-bottom:12px; padding-left:28px; font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:15px; line-height:1.6; color:#444;">
<span style="position:absolute; left:0; top:0; width:18px; height:18px; background-color:#4285f4; border-radius:50%; color:white; text-align:center; line-height:18px; font-size:12px;">1</span>
First point description
</li>
<li style="position:relative; margin-bottom:12px; padding-left:28px; font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:15px; line-height:1.6; color:#444;">
<span style="position:absolute; left:0; top:0; width:18px; height:18px; background-color:#4285f4; border-radius:50%; color:white; text-align:center; line-height:18px; font-size:12px;">2</span>
Second point description
</li>
</ul>
D. **Emphasis Text** (for highlighting key words or phrases):
<span style="background:linear-gradient(180deg, rgba(255,255,255,0) 50%, rgba(66,133,244,0.2) 50%); padding:0 2px;">Text to emphasize</span>
5. **Timeline Design** (suitable for event sequences or news developments):
<div style="margin:25px 0; padding:5px 0;">
<h3 style="font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:18px; color:#333; margin-bottom:15px;">Event Development Timeline</h3>
<div style="position:relative; margin-left:30px; padding-left:30px; border-left:2px solid #e0e0e0;">
<!-- Time Point 1 -->
<div style="position:relative; margin-bottom:25px;">
<div style="position:absolute; width:16px; height:16px; background-color:#4285f4; border-radius:50%; left:-40px; top:0; border:3px solid #ffffff; box-shadow:0 2px 5px rgba(0,0,0,0.1);"></div>
<p style="margin:0 0 5px 0; font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:14px; font-weight:500; color:#4285f4;">June 1, 2023</p>
<p style="margin:0; font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:15px; line-height:1.5; color:#333;">Event description content, concisely explaining the key points and impact of the event.</p>
</div>
<!-- Time Point 2 -->
<div style="position:relative; margin-bottom:25px;">
<div style="position:absolute; width:16px; height:16px; background-color:#4285f4; border-radius:50%; left:-40px; top:0; border:3px solid #ffffff; box-shadow:0 2px 5px rgba(0,0,0,0.1);"></div>
<p style="margin:0 0 5px 0; font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:14px; font-weight:500; color:#4285f4;">June 15, 2023</p>
<p style="margin:0; font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:15px; line-height:1.5; color:#333;">Event description content, concisely explaining the key points and impact of the event.</p>
</div>
</div>
</div>
6. **Comparison Table** (for comparing different options or viewpoints):
<div style="margin:25px 0; padding:15px; background-color:#f8f9fa; border-radius:8px; overflow-x:auto;">
<table style="width:100%; border-collapse:collapse; font-family:'Google Sans',Roboto,Arial,sans-serif;">
<thead>
<tr>
<th style="padding:12px 15px; text-align:left; border-bottom:2px solid #e0e0e0; color:#202124; font-weight:500;">Feature</th>
<th style="padding:12px 15px; text-align:left; border-bottom:2px solid #e0e0e0; color:#202124; font-weight:500;">Option A</th>
<th style="padding:12px 15px; text-align:left; border-bottom:2px solid #e0e0e0; color:#202124; font-weight:500;">Option B</th>
</tr>
</thead>
<tbody>
<tr>
<td style="padding:12px 15px; border-bottom:1px solid #e0e0e0; color:#444;">Cost</td>
<td style="padding:12px 15px; border-bottom:1px solid #e0e0e0; color:#444;">Higher</td>
<td style="padding:12px 15px; border-bottom:1px solid #e0e0e0; color:#444;">Moderate</td>
</tr>
<tr>
<td style="padding:12px 15px; border-bottom:1px solid #e0e0e0; color:#444;">Efficiency</td>
<td style="padding:12px 15px; border-bottom:1px solid #e0e0e0; color:#444;">Very High</td>
<td style="padding:12px 15px; border-bottom:1px solid #e0e0e0; color:#444;">Average</td>
</tr>
</tbody>
</table>
</div>
7. **Chart Data Processing**:
- Bar Chart/Horizontal Bars:
<div style="margin:20px 0; padding:15px; background-color:#f8f9fa; border-radius:8px;">
<p style="margin:0 0 15px 0; font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:16px; font-weight:500; color:#333;">Data Comparison</p>
<!-- Item 1 -->
<div style="margin-bottom:12px;">
<div style="display:flex; align-items:center; justify-content:space-between; margin-bottom:5px;">
<span style="font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:14px; color:#555;">Project A</span>
<span style="font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:14px; font-weight:500; color:#333;">65%</span>
</div>
<div style="height:10px; width:100%; background-color:#e8eaed; border-radius:5px; overflow:hidden;">
<div style="height:100%; width:65%; background:linear-gradient(to right, #4285f4, #5e97f6); border-radius:5px;"></div>
</div>
</div>
<!-- Item 2 -->
<div style="margin-bottom:12px;">
<div style="display:flex; align-items:center; justify-content:space-between; margin-bottom:5px;">
<span style="font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:14px; color:#555;">Project B</span>
<span style="font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:14px; font-weight:500; color:#333;">42%</span>
</div>
<div style="height:10px; width:100%; background-color:#e8eaed; border-radius:5px; overflow:hidden;">
<div style="height:100%; width:42%; background:linear-gradient(to right, #ea4335, #f07575); border-radius:5px;"></div>
</div>
</div>
</div>
8. **Highlight Box** (for displaying tips or reminders):
<div style="margin:25px 0; padding:20px; background-color:#fffde7; border-radius:8px; border-left:4px solid #fdd835; box-shadow:0 1px 5px rgba(0,0,0,0.05);">
<div style="display:flex; align-items:flex-start;">
<div style="flex-shrink:0; margin-right:15px; width:24px; height:24px; background-color:#fdd835; border-radius:50%; display:flex; align-items:center; justify-content:center;">
<span style="color:#fff; font-weight:bold; font-size:16px;">!</span>
</div>
<div>
<p style="margin:0 0 5px 0; font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:16px; font-weight:500; color:#333;">Tip</p>
<p style="margin:0; font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:15px; line-height:1.6; color:#555;">
Here are some additional tips or suggestions to help readers better understand or apply the article content.
</p>
</div>
</div>
</div>
9. **Summary Box**:
<div style="margin:25px 0; padding:20px; background-color:#f2f7fd; border-radius:8px; box-shadow:0 1px 5px rgba(66,133,244,0.1);">
<p style="margin:0 0 10px 0; font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:16px; font-weight:500; color:#1a73e8;">In Simple Terms</p>
<p style="margin:0; font-family:'Google Sans',Roboto,Arial,sans-serif; font-size:15px; line-height:1.6; color:#333;">
This is a concise summary of the entire content, highlighting the most critical findings and conclusions.
</p>
</div>
## Notes:
1. **Only generate content for a single article**, not including title and source, and not including HTML head and tail structure
2. Content should be **200-300 words**, don't force word count
3. **Must use multiple visual elements** (at least 2-3 types), avoid monotonous pure text presentation
4. Use relaxed, natural language, as if chatting with friends
5. Create visual charts for important data, rather than just describing with text
6. Use quote blocks to highlight important viewpoints, and lists to organize multiple points
7. Appropriately use emojis and conversational expressions to increase friendliness
8. Note that the article content has been provided in the previous message, please reply directly, no explanation, no comments, no markups
`,
}
// --- Factory code block ---
// Factory is the component factory that builds Rewriter instances from the
// application config and Dependencies.
type Factory component.Factory[Rewriter, config.App, Dependencies]
// NewFactory returns a Factory for Rewriter instances.
//
// Without arguments it returns the real factory. When one or more mock options
// are given, the returned factory produces mock rewriters with those options
// applied instead.
func NewFactory(mockOn ...component.MockOption) Factory {
	if len(mockOn) == 0 {
		return component.FactoryFunc[Rewriter, config.App, Dependencies](new)
	}

	buildMock := func(instance string, app *config.App, dependencies Dependencies) (Rewriter, error) {
		mock := &mockRewriter{}
		component.MockOptions(mockOn).Apply(&mock.Mock)

		return mock, nil
	}

	return component.FactoryFunc[Rewriter, config.App, Dependencies](buildMock)
}
// new builds a rewriter for the given instance name: it derives the rewrite
// config from app, validates it (which also fills in defaults), and wraps it
// together with the dependencies in a component base.
// It returns an error if the config does not validate.
func new(instance string, app *config.App, dependencies Dependencies) (Rewriter, error) {
	cfg := &Config{}
	cfg.From(app)
	if err := cfg.Validate(); err != nil {
		return nil, errors.Wrap(err, "validate and adjust rewrite config")
	}

	base := component.New(&component.BaseConfig[Config, Dependencies]{
		Name:         "Rewriter",
		Instance:     instance,
		Config:       cfg,
		Dependencies: dependencies,
	})

	return &rewriter{Base: base}, nil
}
// --- Implementation code block ---
// rewriter is the Rewriter implementation; it embeds the component base that
// carries its Config and Dependencies.
type rewriter struct {
*component.Base[Config, Dependencies]
}
// Reload rebuilds the rewrite config from app and swaps it in.
// The current config is left untouched if the new one fails validation.
func (r *rewriter) Reload(app *config.App) error {
	next := Config{}
	next.From(app)

	err := next.Validate()
	if err != nil {
		return errors.Wrap(err, "validate and adjust rewrite config")
	}

	r.SetConfig(&next)

	return nil
}
// Labels runs every configured rule, in order, against the given labels.
//
// For each rule the source label's text is read, skipped if it has fewer runes
// than the rule's threshold, optionally transformed, and then tested against
// the rule's match expression. A matching drop rule ends processing and returns
// nil labels with a nil error; a matching create-or-update rule stores the text
// under the rule's label. The input labels are modified in place.
//
// It returns an error if a transform fails.
func (r *rewriter) Labels(ctx context.Context, labels model.Labels) (rewritten model.Labels, err error) {
	ctx = telemetry.StartWith(ctx, append(r.TelemetryLabels(), telemetrymodel.KeyOperation, "Labels")...)
	defer func() { telemetry.End(ctx, err) }()

	for _, rule := range *r.Config() {
		// Read the source text and skip rules whose input is too short.
		source := labels.Get(rule.SourceLabel)
		if utf8.RuneCountInString(source) < *rule.SkipTooShortThreshold {
			continue
		}

		// Without a transform, the rule works on the source text itself.
		candidate := source
		if rule.Transform != nil {
			out, terr := r.transformText(ctx, rule.Transform, source)
			if terr != nil {
				return nil, errors.Wrap(terr, "transform text")
			}
			candidate = out
		}

		if !rule.matchRE.MatchString(candidate) {
			continue
		}

		if rule.Action == ActionDropFeed {
			return nil, nil
		}
		if rule.Action == ActionCreateOrUpdateLabel {
			labels.Put(rule.Label, candidate, false)
		}
	}

	labels.EnsureSorted()

	return labels, nil
}
// transformText transforms text using configured LLM.
// transformText sends the rendered prompt, a fixed instruction line and the
// source text to the LLM named in the transform, and returns the completion
// after cleanup by transformTextHack.
// It returns an error if the LLM call fails.
func (r *rewriter) transformText(ctx context.Context, transform *Transform, text string) (string, error) {
	messages := []string{
		transform.ToText.promptRendered,
		"The content to be processed is below, and the processing requirements are as above",
		text, // TODO: may place to first line to hit the model cache in different rewrite rules.
	}

	client := r.Dependencies().LLMFactory.Get(transform.ToText.LLM)
	completion, err := client.String(ctx, messages)
	if err != nil {
		return "", errors.Wrap(err, "llm completion")
	}

	return r.transformTextHack(completion), nil
}
// transformTextHack cleans up common artifacts in an LLM completion:
// one trailing newline, a leading "```html" fence and a trailing "```" fence.
// Each is removed at most once, in that order. The input is returned as is
// when nothing needs trimming.
func (r *rewriter) transformTextHack(text string) string {
	const (
		fenceOpen  = "```html"
		fenceClose = "```"
	)

	// View the string's bytes without copying; the slice is only read.
	raw := unsafe.Slice(unsafe.StringData(text), len(text))
	lo, hi := 0, len(raw)

	// Drop a single trailing newline, e.g. "category: tech\n".
	if hi > 0 && raw[hi-1] == '\n' {
		hi--
	}

	// Strip the html code fence markers.
	if hi-lo >= len(fenceOpen) && string(raw[lo:lo+len(fenceOpen)]) == fenceOpen {
		lo += len(fenceOpen)
	}
	if hi-lo >= len(fenceClose) && string(raw[hi-len(fenceClose):hi]) == fenceClose {
		hi -= len(fenceClose)
	}

	// Nothing trimmed: hand back the original string without allocating.
	if lo == 0 && hi == len(raw) {
		return text
	}

	// The only copy happens here.
	return string(raw[lo:hi])
}
// mockRewriter is the mock Rewriter returned by NewFactory when mock options are given.
type mockRewriter struct {
component.Mock
}
// Reload records the call on the mock and returns the error configured as its
// first return value.
func (r *mockRewriter) Reload(app *config.App) error {
	return r.Called(app).Error(0)
}
// Labels records the call on the mock and returns the configured results:
// the first as labels (nil when it was configured as nil) and the second as
// the error.
func (r *mockRewriter) Labels(ctx context.Context, labels model.Labels) (model.Labels, error) {
	call := r.Called(ctx, labels)

	result := call.Get(0)
	if result == nil {
		return nil, call.Error(1)
	}

	rewritten := result.(model.Labels)

	return rewritten, call.Error(1)
}