From 05a33042c86415f170ba49039aaa4716f6b8e5ab Mon Sep 17 00:00:00 2001 From: SgtPepper114 <1992117008@qq.com> Date: Mon, 16 Feb 2026 13:26:57 +0000 Subject: [PATCH] fix(gemini): support dingtalk image markers as multimodal input MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - parse [图片: path] markers in text and convert to Gemini inlineData parts - unify reply path via call_with_tools to reuse multimodal conversion - keep legacy safety behavior (BLOCK_NONE) and restore safety ratings logging on empty response - add multimodal request image-part count log for debugging --- models/gemini/google_gemini_bot.py | 228 ++++++++++++++++++++++++----- 1 file changed, 190 insertions(+), 38 deletions(-) diff --git a/models/gemini/google_gemini_bot.py b/models/gemini/google_gemini_bot.py index 95d4b68..ca73a2a 100644 --- a/models/gemini/google_gemini_bot.py +++ b/models/gemini/google_gemini_bot.py @@ -6,11 +6,14 @@ Google gemini bot """ # encoding:utf-8 +import base64 import json +import mimetypes +import os +import re import time import requests from models.bot import Bot -import google.generativeai as genai from models.session_manager import SessionManager from bridge.context import ContextType, Context from bridge.reply import Reply, ReplyType @@ -18,7 +21,6 @@ from common.log import logger from config import conf from models.chatgpt.chat_gpt_session import ChatGPTSession from models.baidu.baidu_wenxin_session import BaiduWenxinSession -from google.generativeai.types import HarmCategory, HarmBlockThreshold # OpenAI对话模型API (可用) @@ -43,6 +45,7 @@ class GoogleGeminiBot(Bot): self.api_base = "https://generativelanguage.googleapis.com" def reply(self, query, context: Context = None) -> Reply: + session_id = None try: if context.type != ContextType.TEXT: logger.warn(f"[Gemini] Unsupported message type, type={context.type}") @@ -50,43 +53,47 @@ class GoogleGeminiBot(Bot): logger.info(f"[Gemini] query={query}") session_id = context["session_id"] session = self.sessions.session_query(query, session_id) - gemini_messages = self._convert_to_gemini_messages(self.filter_messages(session.messages)) - logger.debug(f"[Gemini] messages={gemini_messages}") - genai.configure(api_key=self.api_key) - model = genai.GenerativeModel(self.model) - - # 添加安全设置 - safety_settings = { - HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE, - HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE, - HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE, - HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE, - } - - # 生成回复,包含安全设置 - response = model.generate_content( - gemini_messages, - safety_settings=safety_settings + filtered_messages = self.filter_messages(session.messages) + logger.debug(f"[Gemini] messages={filtered_messages}") + + response = self.call_with_tools( + messages=filtered_messages, + tools=None, + stream=False, + model=self.model ) - if response.candidates and response.candidates[0].content: - reply_text = response.candidates[0].content.parts[0].text - logger.info(f"[Gemini] reply={reply_text}") - self.sessions.session_reply(reply_text, session_id) - return Reply(ReplyType.TEXT, reply_text) - else: - # 没有有效响应内容,可能内容被屏蔽,输出安全评分 - logger.warning("[Gemini] No valid response generated. Checking safety ratings.") - if hasattr(response, 'candidates') and response.candidates: - for rating in response.candidates[0].safety_ratings: - logger.warning(f"Safety rating: {rating.category} - {rating.probability}") - error_message = "No valid response generated due to safety constraints." + + if isinstance(response, dict) and response.get("error"): + error_message = response.get("message", "Failed to invoke [Gemini] api!") + logger.error(f"[Gemini] API error: {error_message}") self.sessions.session_reply(error_message, session_id) return Reply(ReplyType.ERROR, error_message) + + choices = response.get("choices", []) if isinstance(response, dict) else [] + if choices and choices[0].get("message"): + reply_text = choices[0]["message"].get("content") + if reply_text: + logger.info(f"[Gemini] reply={reply_text}") + self.sessions.session_reply(reply_text, session_id) + return Reply(ReplyType.TEXT, reply_text) + + logger.warning("[Gemini] No valid response generated. Checking safety ratings.") + safety_ratings = response.get("safety_ratings", []) if isinstance(response, dict) else [] + if safety_ratings: + for rating in safety_ratings: + category = rating.get("category", "UNKNOWN") + probability = rating.get("probability", "UNKNOWN") + logger.warning(f"[Gemini] Safety rating: {category} - {probability}") + + error_message = "No valid response generated due to safety constraints." + self.sessions.session_reply(error_message, session_id) + return Reply(ReplyType.ERROR, error_message) except Exception as e: logger.error(f"[Gemini] Error generating response: {str(e)}", exc_info=True) error_message = "Failed to invoke [Gemini] api!" - self.sessions.session_reply(error_message, session_id) + if session_id: + self.sessions.session_reply(error_message, session_id) return Reply(ReplyType.ERROR, error_message) def _convert_to_gemini_messages(self, messages: list): @@ -127,6 +134,93 @@ class GoogleGeminiBot(Bot): turn = "user" return res + @staticmethod + def _extract_image_paths_from_text(content: str): + if not isinstance(content, str): + return "", [] + pattern = r"\[图片:\s*([^\]]+)\]" + image_paths = [m.strip().strip("'\"") for m in re.findall(pattern, content) if m.strip()] + cleaned_text = re.sub(pattern, "", content) + cleaned_text = re.sub(r"\n{3,}", "\n\n", cleaned_text).strip() + return cleaned_text, image_paths + + @staticmethod + def _build_image_inline_part(image_path: str): + if not image_path: + return None + try: + if image_path.startswith("file://"): + image_path = image_path[7:] + + image_path = os.path.expanduser(image_path) + if not os.path.exists(image_path): + logger.warning(f"[Gemini] Image file not found: {image_path}") + return None + + with open(image_path, "rb") as f: + image_bytes = f.read() + + mime_type = mimetypes.guess_type(image_path)[0] or "image/png" + if not mime_type.startswith("image/"): + mime_type = "image/png" + + return { + "inlineData": { + "mimeType": mime_type, + "data": base64.b64encode(image_bytes).decode("utf-8") + } + } + except Exception as e: + logger.warning(f"[Gemini] Failed to build inline image part from path={image_path}, err={e}") + return None + + @staticmethod + def _build_inline_part_from_image_url(image_url): + if not image_url: + return None + + if isinstance(image_url, dict): + image_url = image_url.get("url") + if not image_url or not isinstance(image_url, str): + return None + + if image_url.startswith("data:"): + match = re.match(r"^data:([^;]+);base64,(.+)$", image_url, re.DOTALL) + if not match: + logger.warning("[Gemini] Invalid data URL for image block") + return None + return { + "inlineData": { + "mimeType": match.group(1), + "data": match.group(2).strip() + } + } + + if image_url.startswith("file://") or os.path.exists(os.path.expanduser(image_url)): + return GoogleGeminiBot._build_image_inline_part(image_url) + + if image_url.startswith("http://") or image_url.startswith("https://"): + try: + response = requests.get(image_url, timeout=20) + if response.status_code != 200: + logger.warning(f"[Gemini] Failed to fetch remote image: status={response.status_code}, url={image_url}") + return None + mime_type = response.headers.get("Content-Type", "image/png").split(";")[0].strip() + if not mime_type.startswith("image/"): + mime_type = "image/png" + return { + "inlineData": { + "mimeType": mime_type, + "data": base64.b64encode(response.content).decode("utf-8") + } + } + except Exception as e: + logger.warning(f"[Gemini] Failed to download remote image: url={image_url}, err={e}") + return None + + logger.warning(f"[Gemini] Unsupported image URL format: {image_url[:120]}") + return None + def call_with_tools(self, messages, tools=None, stream=False, **kwargs): """ Call Gemini API with tool support using REST API (following official docs) @@ -145,6 +239,15 @@ class GoogleGeminiBot(Bot): # Build REST API payload payload = {"contents": []} + inline_image_count = 0 + + # Keep legacy behavior: disable Gemini safety blocking like old SDK path. + payload["safetySettings"] = [ + {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_NONE"}, + {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_NONE"}, + {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_NONE"}, + {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_NONE"}, + ] # Extract and set system instruction system_prompt = kwargs.get("system", "") @@ -174,8 +277,19 @@ class GoogleGeminiBot(Bot): parts = [] if isinstance(content, str): - # Simple text content - parts.append({"text": content}) + # Text with optional [图片: /path/to/file] markers + cleaned_text, image_paths = self._extract_image_paths_from_text(content) + if cleaned_text: + parts.append({"text": cleaned_text}) + image_added = False + for image_path in image_paths: + image_part = self._build_image_inline_part(image_path) + if image_part: + parts.append(image_part) + image_added = True + inline_image_count += 1 + if not cleaned_text and not image_added and content: + parts.append({"text": content}) elif isinstance(content, list): # List of content blocks (Claude format) @@ -188,8 +302,39 @@ class GoogleGeminiBot(Bot): block_type = block.get("type") if block_type == "text": - # Text block - parts.append({"text": block.get("text", "")}) + # Text block with optional image markers + block_text = block.get("text", "") + cleaned_text, image_paths = self._extract_image_paths_from_text(block_text) + if cleaned_text: + parts.append({"text": cleaned_text}) + for image_path in image_paths: + image_part = self._build_image_inline_part(image_path) + if image_part: + parts.append(image_part) + + elif block_type in ["image", "image_url"]: + # OpenAI format: {"type":"image_url","image_url":{"url":"..."}} + # Claude format: {"type":"image","source":{"type":"base64","media_type":"...","data":"..."}} + image_part = None + if block_type == "image": + source = block.get("source", {}) + if isinstance(source, dict) and source.get("type") == "base64" and source.get("data"): + image_part = { + "inlineData": { + "mimeType": source.get("media_type", "image/png"), + "data": source.get("data") + } + } + elif block.get("image_url"): + image_part = self._build_inline_part_from_image_url(block.get("image_url")) + else: + image_part = self._build_inline_part_from_image_url(block.get("image_url")) + + if image_part: + parts.append(image_part) + inline_image_count += 1 + else: + logger.warning(f"[Gemini] Skip invalid image block: {str(block)[:200]}") elif block_type == "tool_result": # Convert Claude tool_result to Gemini functionResponse @@ -237,6 +382,9 @@ class GoogleGeminiBot(Bot): "role": gemini_role, "parts": parts }) + + if inline_image_count > 0: + logger.info(f"[Gemini] Multimodal request includes {inline_image_count} image part(s)") # Generation config gen_config = {} @@ -363,15 +511,18 @@ class GoogleGeminiBot(Bot): candidates = data.get("candidates", []) if not candidates: logger.warning("[Gemini] No candidates in response") + prompt_feedback = data.get("promptFeedback", {}) return { "error": True, "message": "No candidates in response", - "status_code": 500 + "status_code": 500, + "safety_ratings": prompt_feedback.get("safetyRatings", []) } candidate = candidates[0] content = candidate.get("content", {}) parts = content.get("parts", []) + safety_ratings = candidate.get("safetyRatings", []) logger.debug(f"[Gemini] Candidate parts count: {len(parts)}") @@ -419,7 +570,8 @@ class GoogleGeminiBot(Bot): "message": message_dict, "finish_reason": "tool_calls" if tool_calls else "stop" }], - "usage": data.get("usageMetadata", {}) + "usage": data.get("usageMetadata", {}), + "safety_ratings": safety_ratings } except Exception as e: