From fc5369a930e7a70485bcb9fd95217cd6a86804dc Mon Sep 17 00:00:00 2001 From: zihanjian Date: Mon, 13 Oct 2025 15:26:24 +0800 Subject: [PATCH] def --- commands/handlers.py | 226 ++++++++++++++++++++++++++++----------- function/func_summary.py | 188 +++++++++++++++++++------------- 2 files changed, 277 insertions(+), 137 deletions(-) diff --git a/commands/handlers.py b/commands/handlers.py index 59d7c84..171b244 100644 --- a/commands/handlers.py +++ b/commands/handlers.py @@ -166,17 +166,16 @@ def handle_chitchat(ctx: 'MessageContext', match: Optional[Match]) -> bool: "function": { "name": "search_chat_history", "description": ( - "Search recent conversation history for specific keywords. " - "Returns at most 20 recent segments, each including the matched message " - "with five surrounding messages (if available) and timestamps. " - "Use this tool when you need precise historical context." + "Search older conversation history (excluding the most recent 30 messages) using multiple related keywords. " + "Provide 2-4 diverse keywords or short phrases that capture the user's intent; synonyms and key names greatly improve recall. " + "The tool returns up to 20 recent segments, each containing deduplicated context lines that have already been formatted." ), "parameters": { "type": "object", "properties": { "keywords": { "type": "array", - "description": "List of keywords to search for in message content.", + "description": "Diverse keywords or short phrases (2-4 recommended) for fuzzy searching message content.", "items": {"type": "string"}, "minItems": 1 }, @@ -202,83 +201,186 @@ def handle_chitchat(ctx: 'MessageContext', match: Optional[Match]) -> bool: } } + range_history_tool = { + "type": "function", + "function": { + "name": "fetch_chat_history_range", + "description": ( + "Retrieve a slice of older conversation by specifying two offsets counted from the latest message " + "(e.g., start_offset=60, end_offset=120 fetches messages between the 60th and 120th most recent). " + "Both offsets must be greater than 30 so that only unseen history is retrieved. " + "Use when you need a contiguous block of messages instead of keyword search." + ), + "parameters": { + "type": "object", + "properties": { + "start_offset": { + "type": "integer", + "description": "Smaller offset counted from the latest message (must be >30)." + }, + "end_offset": { + "type": "integer", + "description": "Larger offset counted from the latest message (must be > start_offset and >30)." + } + }, + "required": ["start_offset", "end_offset"], + "additionalProperties": False + } + } + } + def handle_tool_call(tool_name: str, arguments: Dict[str, Any]) -> str: - if tool_name != "search_chat_history": - return json.dumps({"error": f"Unknown tool '{tool_name}'"}, ensure_ascii=False) - try: - keywords = arguments.get("keywords", []) - if isinstance(keywords, str): - keywords = [keywords] - elif not isinstance(keywords, list): - keywords = [] + if tool_name == "search_chat_history": + keywords = arguments.get("keywords", []) + if isinstance(keywords, str): + keywords = [keywords] + elif not isinstance(keywords, list): + keywords = [] - query = arguments.get("query") - if isinstance(query, str) and query.strip(): - query_keywords = [segment for segment in query.strip().split() if segment] - keywords.extend(query_keywords) + query = arguments.get("query") + if isinstance(query, str) and query.strip(): + query_keywords = [ + segment + for segment in re.split(r"[,\s,。;;]+", query.strip()) + if segment + ] + keywords.extend(query_keywords) - cleaned_keywords = [] - for kw in keywords: - if kw is None: - continue - kw_str = str(kw).strip() - if kw_str: - cleaned_keywords.append(kw_str) + cleaned_keywords = [] + for kw in keywords: + if kw is None: + continue + kw_str = str(kw).strip() + if kw_str: + if len(kw_str) == 1 and not kw_str.isdigit(): + continue + cleaned_keywords.append(kw_str) - # 去重同时保持顺序 - seen = set() - deduped_keywords = [] - for kw in cleaned_keywords: - lower_kw = kw.lower() - if lower_kw not in seen: - seen.add(lower_kw) - deduped_keywords.append(kw) + # 去重同时保持顺序 + seen = set() + deduped_keywords = [] + for kw in cleaned_keywords: + lower_kw = kw.lower() + if lower_kw not in seen: + seen.add(lower_kw) + deduped_keywords.append(kw) - if not deduped_keywords: - return json.dumps({"error": "No valid keywords provided.", "results": []}, ensure_ascii=False) + if not deduped_keywords: + return json.dumps({"error": "No valid keywords provided.", "results": []}, ensure_ascii=False) - context_window = arguments.get("context_window", 5) - max_results = arguments.get("max_results", 20) + context_window = arguments.get("context_window", 5) + max_results = arguments.get("max_results", 20) - print(f"[search_chat_history] chat_id={chat_id}, keywords={deduped_keywords}, " - f"context_window={context_window}, max_results={max_results}") - if ctx.logger: - ctx.logger.info( - f"[search_chat_history] keywords={deduped_keywords}, " - f"context_window={context_window}, max_results={max_results}" + print(f"[search_chat_history] chat_id={chat_id}, keywords={deduped_keywords}, " + f"context_window={context_window}, max_results={max_results}") + if ctx.logger: + ctx.logger.info( + f"[search_chat_history] keywords={deduped_keywords}, " + f"context_window={context_window}, max_results={max_results}" + ) + + search_results = message_summary.search_messages_with_context( + chat_id=chat_id, + keywords=deduped_keywords, + context_window=context_window, + max_groups=max_results ) - search_results = message_summary.search_messages_with_context( - chat_id=chat_id, - keywords=deduped_keywords, - context_window=context_window, - max_groups=max_results - ) + segments = [] + lines_seen = set() + for segment in search_results: + formatted = [] + for line in segment.get("formatted_messages", []): + if line not in lines_seen: + lines_seen.add(line) + formatted.append(line) + if not formatted: + continue + segments.append({ + "matched_keywords": segment.get("matched_keywords", []), + "messages": formatted + }) - response_payload = { - "results": search_results, - "returned_groups": len(search_results), - "keywords": deduped_keywords - } + response_payload = { + "segments": segments, + "returned_groups": len(segments), + "keywords": deduped_keywords + } - print(f"[search_chat_history] returned_groups={len(search_results)}") - if ctx.logger: - ctx.logger.info(f"[search_chat_history] returned_groups={len(search_results)}") + print(f"[search_chat_history] returned_groups={len(segments)}") + if ctx.logger: + ctx.logger.info(f"[search_chat_history] returned_groups={len(segments)}") - if not search_results: - response_payload["notice"] = "No messages matched the provided keywords." + if not segments: + response_payload["notice"] = "No messages matched the provided keywords." + + return json.dumps(response_payload, ensure_ascii=False) + + elif tool_name == "fetch_chat_history_range": + if "start_offset" not in arguments or "end_offset" not in arguments: + return json.dumps({"error": "start_offset and end_offset are required."}, ensure_ascii=False) + + start_offset = arguments.get("start_offset") + end_offset = arguments.get("end_offset") + + try: + start_offset = int(start_offset) + end_offset = int(end_offset) + except (TypeError, ValueError): + return json.dumps({"error": "start_offset and end_offset must be integers."}, ensure_ascii=False) + + if start_offset <= 30 or end_offset <= 30: + return json.dumps({"error": "Offsets must be greater than 30 to avoid visible messages."}, ensure_ascii=False) + + if start_offset > end_offset: + start_offset, end_offset = end_offset, start_offset + + print(f"[fetch_chat_history_range] chat_id={chat_id}, start_offset={start_offset}, " + f"end_offset={end_offset}") + if ctx.logger: + ctx.logger.info( + f"[fetch_chat_history_range] start_offset={start_offset}, " + f"end_offset={end_offset}" + ) + + range_result = message_summary.get_messages_by_reverse_range( + chat_id=chat_id, + start_offset=start_offset, + end_offset=end_offset + ) + + response_payload = { + "start_offset": range_result.get("start_offset"), + "end_offset": range_result.get("end_offset"), + "messages": range_result.get("messages", []), + "returned_count": range_result.get("returned_count", 0), + "total_messages": range_result.get("total_messages", 0) + } + + print(f"[fetch_chat_history_range] returned_count={response_payload['returned_count']}") + if ctx.logger: + ctx.logger.info( + f"[fetch_chat_history_range] returned_count={response_payload['returned_count']}" + ) + + if response_payload["returned_count"] == 0: + response_payload["notice"] = "No messages available in the requested range." + + return json.dumps(response_payload, ensure_ascii=False) + + else: + return json.dumps({"error": f"Unknown tool '{tool_name}'"}, ensure_ascii=False) - return json.dumps(response_payload, ensure_ascii=False) except Exception as tool_exc: if ctx.logger: - ctx.logger.error(f"搜索历史工具调用失败: {tool_exc}", exc_info=True) + ctx.logger.error(f"历史搜索工具调用失败: {tool_exc}", exc_info=True) return json.dumps( - {"error": f"Search failed: {tool_exc.__class__.__name__}"}, + {"error": f"History tool failed: {tool_exc.__class__.__name__}"}, ensure_ascii=False ) - tools = [search_history_tool] + tools = [search_history_tool, range_history_tool] tool_handler = handle_tool_call rsp = chat_model.get_answer( diff --git a/function/func_summary.py b/function/func_summary.py index 1dfbd82..278b1eb 100644 --- a/function/func_summary.py +++ b/function/func_summary.py @@ -240,7 +240,14 @@ class MessageSummary: return messages - def search_messages_with_context(self, chat_id, keywords, context_window=5, max_groups=20): + def search_messages_with_context( + self, + chat_id, + keywords, + context_window=5, + max_groups=20, + exclude_recent=30 + ): """根据关键词搜索消息,返回包含前后上下文的结果 Args: @@ -248,6 +255,7 @@ class MessageSummary: keywords (Union[str, list[str]]): 需要搜索的关键词或关键词列表 context_window (int): 每条匹配消息前后额外提供的消息数量 max_groups (int): 返回的最多结果组数(按时间倒序,优先最新消息) + exclude_recent (int): 跳过最近的若干条消息(默认30条) Returns: list[dict]: 搜索结果列表,每个元素包含匹配关键词、锚点消息及上下文消息 @@ -285,18 +293,32 @@ class MessageSummary: if not messages: return [] + try: + exclude_recent = int(exclude_recent) + except (TypeError, ValueError): + exclude_recent = 30 + exclude_recent = max(0, exclude_recent) + results = [] total_messages = len(messages) + cutoff_index = total_messages - exclude_recent + if cutoff_index <= 0: + return [] + used_indices = set() - for idx in range(total_messages - 1, -1, -1): + for idx in range(cutoff_index - 1, -1, -1): message = messages[idx] content = message.get("content", "") if not content: continue lower_content = content.lower() - matched_keywords = [orig for orig, lower in normalized_keywords if lower in lower_content] + matched_keywords = [ + orig + for orig, lower in normalized_keywords + if lower in lower_content + ] if not matched_keywords: continue @@ -306,9 +328,16 @@ class MessageSummary: start = max(0, idx - context_window) end = min(total_messages, idx + context_window + 1) segment_messages = [] + formatted_lines = [] + seen_lines = set() for pos in range(start, end): msg = messages[pos] + line = f"{msg.get('time')} {msg.get('sender')} {msg.get('content')}" + if line not in seen_lines: + seen_lines.add(line) + formatted_lines.append(line) + segment_messages.append({ "time": msg.get("time"), "sender": msg.get("sender"), @@ -324,7 +353,8 @@ class MessageSummary: "anchor_time": message.get("time"), "anchor_sender": message.get("sender"), "anchor_sender_wxid": message.get("sender_wxid"), - "messages": segment_messages + "messages": segment_messages, + "formatted_messages": formatted_lines }) for off in range(start, end): @@ -335,6 +365,85 @@ class MessageSummary: return results + def get_messages_by_reverse_range( + self, + chat_id, + start_offset, + end_offset, + max_messages_limit=500 + ): + """按倒数范围获取消息 + + Args: + chat_id (str): 聊天ID(群ID或用户ID) + start_offset (int): 离最新消息的起始偏移(倒数 start_offset 条,必须 > 0) + end_offset (int): 离最新消息的结束偏移(倒数 end_offset 条,必须 >= start_offset) + max_messages_limit (int): 内部限制,防止一次返回过多消息 + + Returns: + dict: 包含请求范围信息和格式化消息行 + """ + try: + start_offset = int(start_offset) + end_offset = int(end_offset) + except (TypeError, ValueError): + raise ValueError("start_offset 和 end_offset 必须是整数") + + if start_offset <= 0 or end_offset <= 0: + raise ValueError("start_offset 和 end_offset 必须为正整数") + + if start_offset > end_offset: + start_offset, end_offset = end_offset, start_offset + + try: + max_messages_limit = int(max_messages_limit) + except (TypeError, ValueError): + max_messages_limit = 500 + max_messages_limit = max(1, min(max_messages_limit, 1000)) + + messages = self.get_messages(chat_id) + total_messages = len(messages) + if total_messages == 0: + return { + "start_offset": start_offset, + "end_offset": end_offset, + "messages": [], + "returned_count": 0, + "total_messages": 0 + } + + start_offset = min(start_offset, total_messages) + end_offset = min(end_offset, total_messages) + + start_index = max(total_messages - end_offset, 0) + end_index = min(total_messages - start_offset, total_messages - 1) + + if end_index < start_index: + return { + "start_offset": start_offset, + "end_offset": end_offset, + "messages": [], + "returned_count": 0, + "total_messages": total_messages + } + + selected = messages[start_index:end_index + 1] + if len(selected) > max_messages_limit: + selected = selected[-max_messages_limit:] + + formatted_lines = [ + f"{msg.get('time')} {msg.get('sender')} {msg.get('content')}" + for msg in selected + ] + + return { + "start_offset": start_offset, + "end_offset": end_offset, + "messages": formatted_lines, + "returned_count": len(formatted_lines), + "total_messages": total_messages + } + def _basic_summarize(self, messages): """基本的消息总结逻辑,不使用AI @@ -354,77 +463,6 @@ class MessageSummary: return "\n".join(res) - def _ai_summarize(self, messages, chat_model, chat_id): - """使用AI模型生成消息总结 - - Args: - messages: 消息列表 (格式同 get_messages 返回值) - chat_model: AI聊天模型对象 - chat_id: 聊天ID - - Returns: - str: 消息总结 - """ - if not messages: - return "没有可以总结的历史消息。" - - formatted_msgs = [] - for msg in messages: - # 使用新的时间格式和发送者 - formatted_msgs.append(f"[{msg['time']}]{msg['sender']}: {msg['content']}") - - # 构建提示词 ... (保持不变) - prompt = ( - "你是泡泡,请仔细阅读并分析以下聊天记录,生成一简要的、结构清晰且抓住重点的摘要。\n\n" - "摘要格式要求:\n" - "1. 使用数字编号列表 (例如 1., 2., 3.) 来组织内容,每个编号代表一个独立的主要讨论主题,不要超过3个主题。\n" - "2. 在每个编号的主题下,写成一段不带格式的文字,每个主题单独成段并空行,需包含以下内容:\n" - " - 这个讨论的核心的简要描述。\n" - " - 该讨论的关键成员 (用括号 [用户名] 格式) 和他们的关键发言内容、成员之间的关键互动。\n" - " - 该讨论的讨论结果。\n" - "3. 总结需客观、精炼、简短精悍,直接呈现最核心且精简的事实,尽量不要添加额外的评论或分析,不要总结有关自己的事情。\n" - "4. 不要暴露出格式,不要说核心是xxx参与者是xxx结果是xxx,自然一点。\n\n" - "聊天记录如下:\n" + "\n".join(formatted_msgs) - ) - - try: - # 调用AI部分保持不变,但现在AI模型内部应使用数据库历史记录 - # 确保调用 get_answer 时,AI模型实例已经关联了 MessageSummary - summary = chat_model.get_answer(prompt, f"summary_{chat_id}") # 使用特殊 wxid 避免冲突 - - - if not summary: - return self._basic_summarize(messages) - - return summary - except Exception as e: - self.LOG.error(f"使用AI生成总结失败: {e}") - return self._basic_summarize(messages) - - def summarize_messages(self, chat_id, chat_model=None): - """生成消息总结 - - Args: - chat_id: 聊天ID(群ID或用户ID) - chat_model: AI聊天模型对象,如果为None则使用基础总结 - - Returns: - str: 消息总结 - """ - messages = self.get_messages(chat_id) - if not messages: - return "没有可以总结的历史消息。" - - if chat_model: - # 检查 chat_model 是否具有 get_answer 方法并且已经初始化了 message_summary - if hasattr(chat_model, 'get_answer') and hasattr(chat_model, 'message_summary') and chat_model.message_summary: - return self._ai_summarize(messages, chat_model, chat_id) - else: - self.LOG.warning(f"提供的 chat_model ({type(chat_model)}) 不支持基于数据库历史的总结或未正确初始化。将使用基础总结。") - return self._basic_summarize(messages) - else: - return self._basic_summarize(messages) - def process_message_from_wxmsg(self, msg, wcf, all_contacts, bot_wxid=None): """从微信消息对象中处理并记录与总结相关的文本消息 记录所有群聊和私聊的文本(1)和App/卡片(49)消息。