181 lines
4.6 KiB
Go
181 lines
4.6 KiB
Go
// Copyright (C) 2025 wangyusong
|
|
//
|
|
// This program is free software: you can redistribute it and/or modify
|
|
// it under the terms of the GNU Affero General Public License as published by
|
|
// the Free Software Foundation, either version 3 of the License, or
|
|
// (at your option) any later version.
|
|
//
|
|
// This program is distributed in the hope that it will be useful,
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
// GNU Affero General Public License for more details.
|
|
//
|
|
// You should have received a copy of the GNU Affero General Public License
|
|
// along with this program. If not, see <https://www.gnu.org/licenses/>.
|
|
|
|
package scraper
|
|
|
|
import (
|
|
"context"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/mmcdole/gofeed"
|
|
"github.com/pkg/errors"
|
|
"github.com/stretchr/testify/mock"
|
|
|
|
"github.com/glidea/zenfeed/pkg/model"
|
|
textconvert "github.com/glidea/zenfeed/pkg/util/text_convert"
|
|
)
|
|
|
|
// --- Interface code block ---
|
|
type ScrapeSourceRSS struct {
|
|
URL string
|
|
RSSHubEndpoint string
|
|
RSSHubRoutePath string
|
|
RSSHubAccessKey string
|
|
}
|
|
|
|
func (c *ScrapeSourceRSS) Validate() error {
|
|
if c.URL == "" && c.RSSHubEndpoint == "" {
|
|
return errors.New("URL or RSSHubEndpoint can not be empty at the same time")
|
|
}
|
|
if c.URL == "" {
|
|
c.URL = strings.TrimSuffix(c.RSSHubEndpoint, "/") + "/" + strings.TrimPrefix(c.RSSHubRoutePath, "/")
|
|
}
|
|
if c.URL != "" && !strings.HasPrefix(c.URL, "http://") && !strings.HasPrefix(c.URL, "https://") {
|
|
return errors.New("URL must be a valid HTTP/HTTPS URL")
|
|
}
|
|
|
|
// Append access key as query parameter if provided
|
|
if c.RSSHubAccessKey != "" && !strings.Contains(c.URL, "key=") {
|
|
if strings.Contains(c.URL, "?") {
|
|
c.URL += "&key=" + c.RSSHubAccessKey
|
|
} else {
|
|
c.URL += "?key=" + c.RSSHubAccessKey
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// --- Factory code block ---
|
|
func newRSSReader(config *ScrapeSourceRSS) (reader, error) {
|
|
if err := config.Validate(); err != nil {
|
|
return nil, errors.Wrapf(err, "invalid RSS config")
|
|
}
|
|
|
|
return &rssReader{
|
|
config: config,
|
|
client: &gofeedClient{
|
|
url: config.URL,
|
|
base: gofeed.NewParser(),
|
|
},
|
|
}, nil
|
|
}
|
|
|
|
// --- Implementation code block ---
|
|
type rssReader struct {
|
|
config *ScrapeSourceRSS
|
|
client client
|
|
}
|
|
|
|
func (r *rssReader) Read(ctx context.Context) ([]*model.Feed, error) {
|
|
feed, err := r.client.Get(ctx)
|
|
if err != nil {
|
|
return nil, errors.Wrapf(err, "fetching RSS feed")
|
|
}
|
|
if len(feed.Items) == 0 {
|
|
return []*model.Feed{}, nil
|
|
}
|
|
|
|
now := clk.Now()
|
|
feeds := make([]*model.Feed, 0, len(feed.Items))
|
|
for _, fi := range feed.Items {
|
|
item, err := r.toResultFeed(now, fi)
|
|
if err != nil {
|
|
return nil, errors.Wrapf(err, "converting feed item")
|
|
}
|
|
|
|
feeds = append(feeds, item)
|
|
}
|
|
|
|
return feeds, nil
|
|
}
|
|
|
|
func (r *rssReader) toResultFeed(now time.Time, feedFeed *gofeed.Item) (*model.Feed, error) {
|
|
content := r.combineContent(feedFeed.Content, feedFeed.Description)
|
|
|
|
// Ensure the content is markdown.
|
|
mdContent, err := textconvert.HTMLToMarkdown([]byte(content))
|
|
if err != nil {
|
|
return nil, errors.Wrapf(err, "converting content to markdown")
|
|
}
|
|
|
|
// Create the feed item.
|
|
feed := &model.Feed{
|
|
Labels: model.Labels{
|
|
{Key: model.LabelType, Value: "rss"},
|
|
{Key: model.LabelTitle, Value: feedFeed.Title},
|
|
{Key: model.LabelLink, Value: feedFeed.Link},
|
|
{Key: model.LabelPubTime, Value: r.parseTime(feedFeed).Format(time.RFC3339)},
|
|
{Key: model.LabelContent, Value: string(mdContent)},
|
|
},
|
|
Time: now,
|
|
}
|
|
|
|
return feed, nil
|
|
}
|
|
|
|
// parseTime parses the publication time from the feed item.
|
|
// If the feed item does not have a publication time, it returns the current time.
|
|
func (r *rssReader) parseTime(feedFeed *gofeed.Item) time.Time {
|
|
if feedFeed.PublishedParsed == nil {
|
|
return clk.Now().In(time.Local)
|
|
}
|
|
|
|
return feedFeed.PublishedParsed.In(time.Local)
|
|
}
|
|
|
|
// combineContent combines Content and Description fields with proper formatting.
|
|
func (r *rssReader) combineContent(content, description string) string {
|
|
switch {
|
|
case content == "":
|
|
return description
|
|
case description == "":
|
|
return content
|
|
default:
|
|
return strings.Join([]string{description, content}, "\n\n")
|
|
}
|
|
}
|
|
|
|
type client interface {
|
|
Get(ctx context.Context) (*gofeed.Feed, error)
|
|
}
|
|
|
|
type gofeedClient struct {
|
|
url string
|
|
base *gofeed.Parser
|
|
}
|
|
|
|
func (c *gofeedClient) Get(ctx context.Context) (*gofeed.Feed, error) {
|
|
return c.base.ParseURLWithContext(c.url, ctx)
|
|
}
|
|
|
|
type mockClient struct {
|
|
mock.Mock
|
|
}
|
|
|
|
func newMockClient() *mockClient {
|
|
return &mockClient{}
|
|
}
|
|
|
|
func (c *mockClient) Get(ctx context.Context) (*gofeed.Feed, error) {
|
|
args := c.Called(ctx)
|
|
if args.Error(1) != nil {
|
|
return nil, args.Error(1)
|
|
}
|
|
|
|
return args.Get(0).(*gofeed.Feed), nil
|
|
}
|