feat(数据源): 添加Reddit数据源并优化现有功能

新增Reddit数据源支持，包括获取、翻译和展示功能；优化Twitter数据源显示逻辑；添加fetch请求超时处理；调整数据源配置和分类；更新页脚链接和图片地址。
2025-07-15 17:56:40 +08:00
parent aa716421c9
commit 8374c32ae5
9 changed files with 315 additions and 18 deletions
--- a/src/chatapi.js
+++ b/src/chatapi.js
@@ -32,7 +32,7 @@ async function callGeminiChatAPI(env, promptText, systemPromptText = null) {
    }

    try {
-        const response = await fetch(url, {
+        const response = await fetchWithTimeout(url, {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify(payload)
@@ -137,7 +137,7 @@ async function* callGeminiChatAPIStream(env, promptText, systemPromptText = null

    let response;
    try {
-        response = await fetch(url, {
+        response = await fetchWithTimeout(url, {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify(payload)
@@ -334,7 +334,7 @@ async function callOpenAIChatAPI(env, promptText, systemPromptText = null) {
    };

    try {
-        const response = await fetch(url, {
+        const response = await fetchWithTimeout(url, {
            method: 'POST',
            headers: {
                'Content-Type': 'application/json',
@@ -413,7 +413,7 @@ async function* callOpenAIChatAPIStream(env, promptText, systemPromptText = null

    let response;
    try {
-        response = await fetch(url, {
+        response = await fetchWithTimeout(url, {
            method: 'POST',
            headers: {
                'Content-Type': 'application/json',
@@ -565,3 +565,34 @@ export async function* callChatAPIStream(env, promptText, systemPromptText = nul
        yield* callGeminiChatAPIStream(env, promptText, systemPromptText);
    }
 }
+
+
+/**
+ * 带有超时功能的 fetch 封装
+ * @param {string} resource fetch 的请求 URL
+ * @param {object} options fetch 的配置对象
+ * @param {number} timeout 超时时间，单位毫秒
+ * @returns {Promise<Response>}
+ */
+async function fetchWithTimeout(resource, options = {}, timeout = 60000) {
+  const controller = new AbortController();
+  const id = setTimeout(() => controller.abort(), timeout);
+
+  try {
+    const response = await fetch(resource, {
+      ...options,
+      signal: controller.signal  // 关联 AbortController
+    });
+    return response;
+  } catch (error) {
+    // 当 abort() 被调用时，fetch 会抛出一个 AbortError
+    if (error.name === 'AbortError') {
+      throw new Error('Request timed out');
+    }
+    // 其他网络错误等
+    throw error;
+  } finally {
+    // 清除计时器，防止内存泄漏
+    clearTimeout(id);
+  }
+}
--- a/src/dataFetchers.js
+++ b/src/dataFetchers.js
@@ -7,13 +7,14 @@ import QBitDataSource from './dataSources/qbit.js';
 import JiqizhixinDataSource from './dataSources/jiqizhixin.js';
 import XiaohuDataSource from './dataSources/xiaohu.js';
 import TwitterDataSource from './dataSources/twitter.js';
+import RedditDataSource from './dataSources/reddit.js';

 // Register data sources as arrays to support multiple sources per type
 export const dataSources = {
-    news: { name: '新闻', sources: [AibaseDataSource, XiaohuDataSource] },
+    news: { name: '新闻', sources: [AibaseDataSource, XiaohuDataSource, QBitDataSource, XinZhiYuanDataSource] },
    project: { name: '项目', sources: [GithubTrendingDataSource] },
-    paper: { name: '论文', sources: [HuggingfacePapersDataSource, XinZhiYuanDataSource, QBitDataSource, JiqizhixinDataSource] },
-    socialMedia: { name: '社交平台', sources: [TwitterDataSource] },
+    paper: { name: '论文', sources: [HuggingfacePapersDataSource, JiqizhixinDataSource] },
+    socialMedia: { name: '社交平台', sources: [TwitterDataSource, RedditDataSource] },
    // Add new data sources here as arrays, e.g.,
    // newType: { name: '新类型', sources: [NewTypeDataSource1, NewTypeDataSource2] },
 };
--- a/src/dataSources/reddit.js
+++ b/src/dataSources/reddit.js
@@ -0,0 +1,200 @@
+import { getRandomUserAgent, sleep, isDateWithinLastDays, stripHtml, formatDateToChineseWithTime, escapeHtml} from '../helpers';
+import { callChatAPI } from '../chatapi.js';
+import { removeMarkdownCodeBlock } from '../helpers.js';
+
+const RedditDataSource = {
+    async fetch(env, foloCookie) {
+        const listId = env.REDDIT_LIST_ID;
+        const fetchPages = parseInt(env.REDDIT_FETCH_PAGES || '3', 10);
+        const allRedditItems = [];
+        const filterDays = parseInt(env.FOLO_FILTER_DAYS || '3', 10);
+
+        if (!listId) {
+            console.error('REDDIT_LIST_ID is not set in environment variables.');
+            return {
+                version: "https://jsonfeed.org/version/1.1",
+                title: "Reddit Feeds",
+                home_page_url: "https://www.reddit.com/",
+                description: "Aggregated Reddit feeds from various subreddits/users",
+                language: "zh-cn",
+                items: []
+            };
+        }
+
+        let publishedAfter = null;
+        for (let i = 0; i < fetchPages; i++) {
+            const userAgent = getRandomUserAgent();
+            const headers = {
+                'User-Agent': userAgent,
+                'Content-Type': 'application/json',
+                'accept': 'application/json',
+                'accept-language': 'zh-CN,zh;q=0.9',
+                'baggage': 'sentry-environment=stable,sentry-release=5251fa921ef6cbb6df0ac4271c41c2b4a0ce7c50,sentry-public_key=e5bccf7428aa4e881ed5cb713fdff181,sentry-trace_id=2da50ca5ad944cb794670097d876ada8,sentry-sampled=true,sentry-sample_rand=0.06211835167903246,sentry-sample_rate=1',
+                'origin': 'https://app.follow.is',
+                'priority': 'u=1, i',
+                'sec-ch-ua': '"Google Chrome";v="135", "Not-A.Brand";v="8", "Chromium";v="135"',
+                'sec-ch-ua-mobile': '?1',
+                'sec-ch-ua-platform': '"Android"',
+                'sec-fetch-dest': 'empty',
+                'sec-fetch-mode': 'cors',
+                'sec-fetch-site': 'same-site',
+                'x-app-name': 'Folo Web',
+                'x-app-version': '0.4.9',
+            };
+
+            if (foloCookie) {
+                headers['Cookie'] = foloCookie;
+            }
+
+            const body = {
+                listId: listId,
+                view: 1,
+                withContent: true,
+            };
+
+            if (publishedAfter) {
+                body.publishedAfter = publishedAfter;
+            }
+
+            try {
+                console.log(`Fetching Reddit data, page ${i + 1}...`);
+                const response = await fetch(env.FOLO_DATA_API, {
+                    method: 'POST',
+                    headers: headers,
+                    body: JSON.stringify(body),
+                });
+
+                if (!response.ok) {
+                    console.error(`Failed to fetch Reddit data, page ${i + 1}: ${response.statusText}`);
+                    break;
+                }
+                const data = await response.json();
+                if (data && data.data && data.data.length > 0) {
+                    const filteredItems = data.data.filter(entry => isDateWithinLastDays(entry.entries.publishedAt, filterDays));
+                    allRedditItems.push(...filteredItems.map(entry => ({
+                        id: entry.entries.id,
+                        url: entry.entries.url,
+                        title: entry.entries.title,
+                        content_html: entry.entries.content,
+                        date_published: entry.entries.publishedAt,
+                        authors: [{ name: entry.entries.author }],
+                        source: `${entry.feeds.title}` ,
+                    })));
+                    publishedAfter = data.data[data.data.length - 1].entries.publishedAt;
+                } else {
+                    console.log(`No more data for Reddit, page ${i + 1}.`);
+                    break;
+                }
+            } catch (error) {
+                console.error(`Error fetching Reddit data, page ${i + 1}:`, error);
+                break;
+            }
+
+            await sleep(Math.random() * 5000);
+        }
+
+        const redditData = {
+            version: "https://jsonfeed.org/version/1.1",
+            title: "Reddit Feeds",
+            home_page_url: "https://www.reddit.com/",
+            description: "Aggregated Reddit feeds from various subreddits/users",
+            language: "zh-cn",
+            items: allRedditItems
+        };
+
+        if (redditData.items.length === 0) {
+            console.log("No reddit posts found for today or after filtering.");
+            return redditData;
+        }
+
+        if (!env.OPEN_TRANSLATE === "true") {
+            console.warn("Skipping reddit translations.");
+            redditData.items = redditData.items.map(item => ({
+                ...item,
+                title_zh: item.title || ""
+            }));
+            return redditData;
+        }
+
+        const itemsToTranslate = redditData.items.map((item, index) => ({
+            id: index,
+            original_title: item.title || ""
+        }));
+
+        const hasContentToTranslate = itemsToTranslate.some(item => item.original_title.trim() !== "");
+        if (!hasContentToTranslate) {
+            console.log("No non-empty reddit titles to translate for today's posts.");
+            redditData.items = redditData.items.map(item => ({ ...item, title_zh: item.title || "" }));
+            return redditData;
+        }
+
+        const promptText = `You will be given a JSON array of reddit data objects. Each object has an "id" and "original_title".
+Translate "original_title" into Chinese.
+Return a JSON array of objects. Each output object MUST have:
+- "id": The same id from the input.
+- "title_zh": Chinese translation of "original_title". Empty if original is empty.
+Input: ${JSON.stringify(itemsToTranslate)}
+Respond ONLY with the JSON array.`;
+
+        let translatedItemsMap = new Map();
+        try {
+            console.log(`Requesting translation for ${itemsToTranslate.length} reddit titles for today.`);
+            const chatResponse = await callChatAPI(env, promptText);
+            const parsedTranslations = JSON.parse(removeMarkdownCodeBlock(chatResponse));
+
+            if (parsedTranslations) {
+                parsedTranslations.forEach(translatedItem => {
+                    if (translatedItem && typeof translatedItem.id === 'number' &&
+                        typeof translatedItem.title_zh === 'string') {
+                        translatedItemsMap.set(translatedItem.id, translatedItem);
+                    }
+                });
+            }
+        } catch (translationError) {
+            console.error("Failed to translate reddit titles in batch:", translationError.message);
+        }
+
+        redditData.items = redditData.items.map((originalItem, index) => {
+            const translatedData = translatedItemsMap.get(index);
+            return {
+                ...originalItem,
+                title_zh: translatedData ? translatedData.title_zh : (originalItem.title || "")
+            };
+        });
+
+        return redditData;
+    },
+
+    transform(rawData, sourceType) {
+        if (!rawData || !rawData.items) {
+            return [];
+        }
+
+        return rawData.items.map(item => ({
+            id: item.id,
+            type: sourceType,
+            url: item.url,
+            title: item.title_zh || item.title, // Use translated title if available
+            description: stripHtml(item.content_html || ""),
+            published_date: item.date_published,
+            authors: item.authors ? item.authors.map(author => author.name).join(', ') : 'Unknown',
+            source: item.source || 'reddit',
+            details: {
+                content_html: item.content_html || ""
+            }
+        }));
+    },
+
+    generateHtml: (item) => {
+        return `
+            <strong>${escapeHtml(item.title)}</strong><br>
+            <small>来源: ${escapeHtml(item.source || '未知')} | 发布日期: ${formatDateToChineseWithTime(item.published_date)}</small>
+            <div class="content-html">
+                ${item.details.content_html || '无内容。'}
+            </div>
+            <a href="${escapeHtml(item.url)}" target="_blank" rel="noopener noreferrer">查看 Reddit 帖子</a>
+        `;
+    }
+};
+
+export default RedditDataSource;
--- a/src/dataSources/twitter.js
+++ b/src/dataSources/twitter.js
@@ -77,7 +77,7 @@ const TwitterDataSource = {
                        content_html: entry.entries.content,
                        date_published: entry.entries.publishedAt,
                        authors: [{ name: entry.entries.author }],
-                        source: entry.feeds.title && entry.feeds.title.includes('即刻圈子') ? `${entry.feeds.title} - ${entry.entries.author}` : `twitter-${entry.entries.author}`,
+                        source: entry.feeds.title && entry.feeds.title.startsWith('Twitter') ? `twitter-${entry.entries.author}` : `${entry.feeds.title} - ${entry.entries.author}` ,
                    })));
                    publishedAfter = data.data[data.data.length - 1].entries.publishedAt;
                } else {
--- a/src/foot.js
+++ b/src/foot.js
@@ -7,8 +7,8 @@ export function insertFoot() {

 | 🎙️ **小宇宙** | 📹 **抖音** |
 | --- | --- |
-| [来生小酒馆](https://www.xiaoyuzhoufm.com/podcast/683c62b7c1ca9cf575a5030e)  |   [来生情报站](https://www.douyin.com/user/MS4wLjABAAAAwpwqPQlu38sO38VyWgw9ZjDEnN4bMR5j8x111UxpseHR9DpB6-CveI5KRXOWuFwG)| 
-| ![小酒馆](https://s1.imagehub.cc/images/2025/06/24/f959f7984e9163fc50d3941d79a7f262.md.png) | ![情报站](https://s1.imagehub.cc/images/2025/06/24/7fc30805eeb831e1e2baa3a240683ca3.md.png) |
+| [来生小酒馆](https://www.xiaoyuzhoufm.com/podcast/683c62b7c1ca9cf575a5030e)  |   [自媒体账号](https://www.douyin.com/user/MS4wLjABAAAAwpwqPQlu38sO38VyWgw9ZjDEnN4bMR5j8x111UxpseHR9DpB6-CveI5KRXOWuFwG)| 
+| ![小酒馆](https://cdn.jsdmirror.com/gh/justlovemaki/imagehub@main/logo/f959f7984e9163fc50d3941d79a7f262.md.png) | ![情报站](https://cdn.jsdmirror.com/gh/justlovemaki/imagehub@main/logo/7fc30805eeb831e1e2baa3a240683ca3.md.png) |

    `;
 }
--- a/src/handlers/commitToGitHub.js
+++ b/src/handlers/commitToGitHub.js
@@ -28,7 +28,7 @@ export async function handleCommitToGitHub(request, env) {

        if (dailyMd) {
            filesToCommit.push({ path: `daily/${dateStr}.md`, content: formatMarkdownText(dailyMd), description: "Daily Summary File" });
-            report.content_html = marked.parse(formatMarkdownText(replaceImageProxy(env.IMG_PROXY, dailyMd)));
+            report.content_html = marked.parse(formatMarkdownText(env.IMG_PROXY, dailyMd));
            storeInKV(env.DATA_KV, `${dateStr}-report`, report);
        }
        if (podcastMd) {
--- a/src/handlers/writeRssData.js
+++ b/src/handlers/writeRssData.js
@@ -2,22 +2,31 @@ import { replaceImageProxy, formatMarkdownText, formatDateToGMT12WithTime } from
 import { getDailyReportContent } from '../github.js';
 import { storeInKV } from '../kv.js';
 import { marked } from '../marked.esm.js';
+import { callChatAPI } from '../chatapi.js'; // 导入 callChatAPI
+import { getSummarizationSimplifyPrompt } from "../prompt/summarizationSimplifyPrompt";

 export async function handleWriteRssData(request, env) {
    const url = new URL(request.url);
    const dateStr = url.searchParams.get('date');
+    console.log(`[writeRssData] Received request for date: ${dateStr}`);

    if (!dateStr) {
+        console.error('[writeRssData] Missing date parameter');
        return new Response('Missing date parameter', { status: 400 });
    }

    try {
        const path = `daily/${dateStr}.md`;
-        const content = await getDailyReportContent(env, path);
+        console.log(`[writeRssData] Attempting to get content from GitHub path: ${path}`);
+        let content = await getDailyReportContent(env, path);
+        
        if (!content) {
+            console.warn(`[writeRssData] No content found for ${path}. Returning 404.`);
            return new Response(`No content found for ${path}`, { status: 404 });
        }
+        console.log(`[writeRssData] Successfully retrieved content for ${path}. Content length: ${content.length}`);

+        //content = extractContentFromSecondHash(content);
        // 从 "YYYY-MM-DD" 格式的 dateStr 中提取 "YYYY-MM"
        const yearMonth = dateStr.substring(0, 7);
        const report = {
@@ -28,15 +37,61 @@ export async function handleWriteRssData(request, env) {
            // 可以添加其他相關欄位，例如作者、來源等
            published_date: formatDateToGMT12WithTime(new Date()) // 記錄保存時間
        }
-        report.content_html = marked.parse(formatMarkdownText(replaceImageProxy(env.IMG_PROXY, content)));
-        storeInKV(env.DATA_KV, `${dateStr}-report`, report);
+        report.content_html = marked.parse(formatMarkdownText(replaceImageProxy(env, content)));
+        
+        const kvKey = `${dateStr}-report`;
+        console.log(`[writeRssData] Preparing to store report in KV. Key: ${kvKey}, Report object:`, JSON.stringify(report).substring(0, 200) + '...'); // Log first 200 chars
+        await storeInKV(env.DATA_KV, kvKey, report);
+        console.log(`[writeRssData] Successfully stored report in KV with key: ${kvKey}`);

        return new Response(JSON.stringify(report), {
            headers: { 'Content-Type': 'application/json' },
            status: 200
        });
    } catch (error) {
-        console.error('Error handling daily report:', error.message);
+        console.error('[writeRssData] Error handling daily report:', error.message, error.stack);
        return new Response(`Error handling daily report: ${error.message}`, { status: 500 });
    }
-}
+}
+
+/**
+ * 从第二个 ### 开始截取内容，包括 ###。
+ *
+ * @param {string} content - 原始文本内容。
+ * @returns {string} 截取后的内容。
+ */
+export function extractContentFromSecondHash(content) {
+    const parts = content.split('###');
+    if (parts.length > 2) {
+        // 原始逻辑：重新组合从第二个 ### 开始的所有部分
+        const newcontent = '###' + parts.slice(2).join('###');
+        const lastHashIndex = newcontent.lastIndexOf('###');
+        if (lastHashIndex !== -1) {
+            return newcontent.substring(0, lastHashIndex);
+        }
+    }
+    return content; // 如果没有找到 ### 或不符合上述条件，则返回原始内容
+}
+
+/**
+ * 调用 Gemini 或 OpenAI 模型生成指定提示词的内容。
+ * 此方法可供外部调用。
+ *
+ * @param {object} env - 环境对象，包含 AI 模型相关的配置。
+ * @param {string} promptText - 用户提示词。
+ * @returns {Promise<string>} AI 模型生成的内容。
+ * @throws {Error} 如果 API 调用失败或返回空内容。
+ */
+export async function generateAIContent(env, promptText) {
+    console.log(`[generateAIContent] Calling AI model with prompt: ${promptText.substring(0, 100)}...`);
+    try {
+        let result = await callChatAPI(env, promptText, getSummarizationSimplifyPrompt());
+        console.log(`[generateAIContent] AI model returned content. Length: ${result.length}`);
+
+        result += "\n\n </br>"+env.INSERT_APP_URL;
+        return result;
+    } catch (error) {
+        console.error('[generateAIContent] Error calling AI model:', error.message, error.stack);
+        throw new Error(`Failed to generate AI content: ${error.message}`);
+    }
+}
--- a/src/prompt/summarizationSimplifyPrompt.js
+++ b/src/prompt/summarizationSimplifyPrompt.js
@@ -0,0 +1,7 @@
+// Add new data sources
+export function getSummarizationSimplifyPrompt() {
+    return `
+简化每一段的文字为一句话描述，每句话不超过30个字，将所有的句子过渡词和连接词替换为最基础、最常用的词语。尽量使用简单、直接的表达方式，避免使用复杂或生僻的词汇。确保句子之间的逻辑关系清晰。
+可以合并同类的输出信息，保持原有的小标题，为生成后的每一段内容从1开始排序.
+    `;
+}
--- a/wrangler.toml
+++ b/wrangler.toml
@@ -34,7 +34,9 @@ QBIT_FETCH_PAGES = "1"
 XINZHIYUAN_FEED_ID = "60901577013168128" 
 XINZHIYUAN_FETCH_PAGES = "1" 
 TWITTER_LIST_ID = "153028784690326528" 
-TWITTER_FETCH_PAGES = "5" 
+TWITTER_FETCH_PAGES = "2" 
+REDDIT_LIST_ID = "167576006499975168" 
+REDDIT_FETCH_PAGES = "2" 
 PROJECTS_API_URL = "https://git-trending.justlikemaki.vip/topone/?since=daily"
 GITHUB_TOKEN = "github_pat_xxxxxx"
 GITHUB_REPO_OWNER = "justlovemaki"
@@ -48,4 +50,5 @@ PODCAST_TITLE = "来生小酒馆"
 PODCAST_BEGIN = "嘿，亲爱的V，欢迎收听新一期的来生情报站，我是你们的老朋友，何夕2077"
 PODCAST_END = "今天的情报就到这里，注意隐蔽，赶紧撤离"
 BOOK_LINK = ""
-INSERT_FOOT = "false"
+INSERT_FOOT = "false"
+INSERT_APP_URL = "<h3>[查看完整版AI日报↗️ https://ai.hubtoday.app/](https://ai.hubtoday.app/)</h3>"