init
This commit is contained in:
171
pkg/scrape/scraper/rss.go
Normal file
171
pkg/scrape/scraper/rss.go
Normal file
@@ -0,0 +1,171 @@
|
||||
// Copyright (C) 2025 wangyusong
|
||||
//
|
||||
// This program is free software: you can redistribute it and/or modify
|
||||
// it under the terms of the GNU Affero General Public License as published by
|
||||
// the Free Software Foundation, either version 3 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU Affero General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU Affero General Public License
|
||||
// along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
|
||||
package scraper
|
||||
|
||||
import (
|
||||
"context"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/mmcdole/gofeed"
|
||||
"github.com/pkg/errors"
|
||||
"github.com/stretchr/testify/mock"
|
||||
|
||||
"github.com/glidea/zenfeed/pkg/model"
|
||||
textconvert "github.com/glidea/zenfeed/pkg/util/text_convert"
|
||||
)
|
||||
|
||||
// --- Interface code block ---
|
||||
type ScrapeSourceRSS struct {
|
||||
URL string
|
||||
RSSHubEndpoint string
|
||||
RSSHubRoutePath string
|
||||
}
|
||||
|
||||
func (c *ScrapeSourceRSS) Validate() error {
|
||||
if c.URL == "" && c.RSSHubEndpoint == "" {
|
||||
return errors.New("URL or RSSHubEndpoint can not be empty at the same time")
|
||||
}
|
||||
if c.URL == "" {
|
||||
c.URL = strings.TrimSuffix(c.RSSHubEndpoint, "/") + "/" + strings.TrimPrefix(c.RSSHubRoutePath, "/")
|
||||
}
|
||||
if c.URL != "" && !strings.HasPrefix(c.URL, "http://") && !strings.HasPrefix(c.URL, "https://") {
|
||||
return errors.New("URL must be a valid HTTP/HTTPS URL")
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// --- Factory code block ---
|
||||
func newRSSReader(config *ScrapeSourceRSS) (reader, error) {
|
||||
if err := config.Validate(); err != nil {
|
||||
return nil, errors.Wrapf(err, "invalid RSS config")
|
||||
}
|
||||
|
||||
return &rssReader{
|
||||
config: config,
|
||||
client: &gofeedClient{
|
||||
url: config.URL,
|
||||
base: gofeed.NewParser(),
|
||||
},
|
||||
}, nil
|
||||
}
|
||||
|
||||
// --- Implementation code block ---
|
||||
|
||||
type rssReader struct {
|
||||
config *ScrapeSourceRSS
|
||||
client client
|
||||
}
|
||||
|
||||
func (r *rssReader) Read(ctx context.Context) ([]*model.Feed, error) {
|
||||
feed, err := r.client.Get(ctx)
|
||||
if err != nil {
|
||||
return nil, errors.Wrapf(err, "fetching RSS feed")
|
||||
}
|
||||
if len(feed.Items) == 0 {
|
||||
return []*model.Feed{}, nil
|
||||
}
|
||||
|
||||
now := clk.Now()
|
||||
feeds := make([]*model.Feed, 0, len(feed.Items))
|
||||
for _, fi := range feed.Items {
|
||||
item, err := r.toResultFeed(now, fi)
|
||||
if err != nil {
|
||||
return nil, errors.Wrapf(err, "converting feed item")
|
||||
}
|
||||
|
||||
feeds = append(feeds, item)
|
||||
}
|
||||
|
||||
return feeds, nil
|
||||
}
|
||||
|
||||
func (r *rssReader) toResultFeed(now time.Time, feedFeed *gofeed.Item) (*model.Feed, error) {
|
||||
content := r.combineContent(feedFeed.Content, feedFeed.Description)
|
||||
|
||||
// Ensure the content is markdown.
|
||||
mdContent, err := textconvert.HTMLToMarkdown([]byte(content))
|
||||
if err != nil {
|
||||
return nil, errors.Wrapf(err, "converting content to markdown")
|
||||
}
|
||||
|
||||
// Create the feed item.
|
||||
feed := &model.Feed{
|
||||
Labels: model.Labels{
|
||||
{Key: model.LabelType, Value: "rss"},
|
||||
{Key: model.LabelTitle, Value: feedFeed.Title},
|
||||
{Key: model.LabelLink, Value: feedFeed.Link},
|
||||
{Key: model.LabelPubTime, Value: r.parseTime(feedFeed).Format(time.RFC3339)},
|
||||
{Key: model.LabelContent, Value: string(mdContent)},
|
||||
},
|
||||
Time: now,
|
||||
}
|
||||
|
||||
return feed, nil
|
||||
}
|
||||
|
||||
// parseTime parses the publication time from the feed item.
|
||||
// If the feed item does not have a publication time, it returns the current time.
|
||||
func (r *rssReader) parseTime(feedFeed *gofeed.Item) time.Time {
|
||||
if feedFeed.PublishedParsed == nil {
|
||||
return clk.Now().In(time.Local)
|
||||
}
|
||||
|
||||
return feedFeed.PublishedParsed.In(time.Local)
|
||||
}
|
||||
|
||||
// combineContent combines Content and Description fields with proper formatting.
|
||||
func (r *rssReader) combineContent(content, description string) string {
|
||||
switch {
|
||||
case content == "":
|
||||
return description
|
||||
case description == "":
|
||||
return content
|
||||
default:
|
||||
return strings.Join([]string{description, content}, "\n\n")
|
||||
}
|
||||
}
|
||||
|
||||
type client interface {
|
||||
Get(ctx context.Context) (*gofeed.Feed, error)
|
||||
}
|
||||
|
||||
type gofeedClient struct {
|
||||
url string
|
||||
base *gofeed.Parser
|
||||
}
|
||||
|
||||
func (c *gofeedClient) Get(ctx context.Context) (*gofeed.Feed, error) {
|
||||
return c.base.ParseURLWithContext(c.url, ctx)
|
||||
}
|
||||
|
||||
type mockClient struct {
|
||||
mock.Mock
|
||||
}
|
||||
|
||||
func newMockClient() *mockClient {
|
||||
return &mockClient{}
|
||||
}
|
||||
|
||||
func (c *mockClient) Get(ctx context.Context) (*gofeed.Feed, error) {
|
||||
args := c.Called(ctx)
|
||||
if args.Error(1) != nil {
|
||||
return nil, args.Error(1)
|
||||
}
|
||||
|
||||
return args.Get(0).(*gofeed.Feed), nil
|
||||
}
|
||||
Reference in New Issue
Block a user