diff --git a/README.md b/README.md index bd0b4c6..0e81067 100644 --- a/README.md +++ b/README.md @@ -79,7 +79,7 @@ graph TD
  • Claude-3.7:Anthropic的高级理解与解释
  • DeepSeek-v3/r1:专为中文场景优化的模型
  • QVQ-MAX/Qwen-VL-MAX:以视觉推理闻名的国产AI
  • -
  • Gemini-2.5-Pro/2.0-flash:智商130的非推理AI
  • +
  • Gemini-2.5-Pro/2.5-flash:智商130的非推理AI
  • @@ -189,7 +189,7 @@ python app.py | **QVQ-MAX** | 多模态支持,推理支持 | 复杂问题,视觉分析 | | **Qwen-VL-MAX** | 多模态支持 | 简单问题,视觉分析 | | **Gemini-2.5-Pro** | 多模态支持 | 复杂问题,视觉分析 | -| **Gemini-2.0-Flash** | 多模态支持 | 简单问题,视觉分析 | +| **Gemini-2.5-Flash** | 多模态支持 | 简单问题,视觉分析 | ### 🛠️ 可调参数 @@ -247,4 +247,4 @@ python app.py ## 📜 开源协议 -本项目采用 [Apache 2.0](LICENSE) 协议。 +本项目采用 [Apache 2.0](LICENSE) 协议。 diff --git a/app.py b/app.py index 73cf796..0160584 100644 --- a/app.py +++ b/app.py @@ -101,6 +101,8 @@ def create_model_instance(model_id, settings, is_reasoning=False): api_key_id = "AlibabaApiKey" elif "gemini" in model_id.lower() or "google" in model_id.lower(): api_key_id = "GoogleApiKey" + elif "doubao" in model_id.lower(): + api_key_id = "DoubaoApiKey" # 首先尝试从本地配置获取API密钥 api_key = get_api_key(api_key_id) @@ -156,6 +158,10 @@ def create_model_instance(model_id, settings, is_reasoning=False): custom_base_url = api_base_urls.get('google') if custom_base_url: base_url = custom_base_url + elif "doubao" in model_id.lower(): + custom_base_url = api_base_urls.get('doubao') + if custom_base_url: + base_url = custom_base_url # 创建模型实例 model_instance = ModelFactory.create_model( @@ -318,39 +324,66 @@ def handle_text_extraction(data): if not isinstance(settings, dict): raise ValueError("Invalid settings format") - # 尝试从本地配置获取Mathpix API密钥 - mathpix_app_id = get_api_key('MathpixAppId') - mathpix_app_key = get_api_key('MathpixAppKey') + # 优先使用百度OCR,如果没有配置则使用Mathpix + # 首先尝试获取百度OCR API密钥 + baidu_api_key = get_api_key('BaiduApiKey') + baidu_secret_key = get_api_key('BaiduSecretKey') - # 构建完整的Mathpix API密钥(格式:app_id:app_key) - mathpix_key = f"{mathpix_app_id}:{mathpix_app_key}" if mathpix_app_id and mathpix_app_key else None + # 构建百度OCR API密钥(格式:api_key:secret_key) + ocr_key = None + ocr_model = None - # 如果本地没有配置,尝试使用前端传递的密钥(向后兼容) - if not mathpix_key: - mathpix_key = settings.get('mathpixApiKey') + if baidu_api_key and baidu_secret_key: + ocr_key = f"{baidu_api_key}:{baidu_secret_key}" + ocr_model = 
'baidu-ocr' + print("Using Baidu OCR for text extraction...") + else: + # 回退到Mathpix + mathpix_app_id = get_api_key('MathpixAppId') + mathpix_app_key = get_api_key('MathpixAppKey') + + # 构建完整的Mathpix API密钥(格式:app_id:app_key) + mathpix_key = f"{mathpix_app_id}:{mathpix_app_key}" if mathpix_app_id and mathpix_app_key else None + + # 如果本地没有配置,尝试使用前端传递的密钥(向后兼容) + if not mathpix_key: + mathpix_key = settings.get('mathpixApiKey') + + if mathpix_key: + ocr_key = mathpix_key + ocr_model = 'mathpix' + print("Using Mathpix OCR for text extraction...") - if not mathpix_key: - raise ValueError("Mathpix API key is required") + if not ocr_key: + raise ValueError("OCR API key is required. Please configure Baidu OCR (API Key + Secret Key) or Mathpix (App ID + App Key)") # 先回复客户端,确认已收到请求,防止超时断开 # 注意:这里不能使用return,否则后续代码不会执行 socketio.emit('request_acknowledged', { 'status': 'received', - 'message': 'Image received, text extraction in progress' + 'message': f'Image received, text extraction in progress using {ocr_model}' }, room=request.sid) try: - app_id, app_key = mathpix_key.split(':') - if not app_id.strip() or not app_key.strip(): - raise ValueError() + if ocr_model == 'baidu-ocr': + api_key, secret_key = ocr_key.split(':') + if not api_key.strip() or not secret_key.strip(): + raise ValueError() + elif ocr_model == 'mathpix': + app_id, app_key = ocr_key.split(':') + if not app_id.strip() or not app_key.strip(): + raise ValueError() except ValueError: - raise ValueError("Invalid Mathpix API key format. Expected format: 'app_id:app_key'") + if ocr_model == 'baidu-ocr': + raise ValueError("Invalid Baidu OCR API key format. Expected format: 'API_KEY:SECRET_KEY'") + else: + raise ValueError("Invalid Mathpix API key format. 
Expected format: 'app_id:app_key'") - print("Creating Mathpix model instance...") - # 只传递必需的参数,ModelFactory.create_model会处理不同模型类型 + print(f"Creating {ocr_model} model instance...") + # ModelFactory.create_model会处理不同模型类型 model = ModelFactory.create_model( - model_name='mathpix', - api_key=mathpix_key + model_name=ocr_model, + api_key=ocr_key ) print("Starting text extraction...") diff --git a/config/api_base_urls.json b/config/api_base_urls.json index f5136e7..192c0b7 100644 --- a/config/api_base_urls.json +++ b/config/api_base_urls.json @@ -3,5 +3,6 @@ "OpenaiApiBaseUrl": "", "DeepseekApiBaseUrl": "", "AlibabaApiBaseUrl": "", - "GoogleApiBaseUrl": "" + "GoogleApiBaseUrl": "", + "DoubaoApiBaseUrl": "" } \ No newline at end of file diff --git a/config/models.json b/config/models.json index bdd2481..fe35fd8 100644 --- a/config/models.json +++ b/config/models.json @@ -24,6 +24,11 @@ "name": "Google", "api_key_id": "GoogleApiKey", "class_name": "GoogleModel" + }, + "doubao": { + "name": "Doubao", + "api_key_id": "DoubaoApiKey", + "class_name": "DoubaoModel" } }, "models": { @@ -91,21 +96,29 @@ "version": "latest", "description": "阿里通义千问VL-MAX模型,视觉理解能力最强,支持图像理解和复杂任务" }, - "gemini-2.5-pro-preview-03-25": { + "gemini-2.5-pro": { "name": "Gemini 2.5 Pro", "provider": "google", "supportsMultimodal": true, "isReasoning": true, - "version": "preview-03-25", - "description": "Google最强大的Gemini 2.5 Pro模型,支持图像理解" + "version": "latest", + "description": "Google最强大的Gemini 2.5 Pro模型,支持图像理解(需要付费API密钥)" }, - "gemini-2.0-flash": { - "name": "Gemini 2.0 Flash", + "gemini-2.5-flash": { + "name": "Gemini 2.5 Flash", "provider": "google", "supportsMultimodal": true, "isReasoning": false, "version": "latest", - "description": "Google更快速的Gemini 2.0 Flash模型,支持图像理解,响应更迅速" + "description": "Google更快速的Gemini 2.5 Flash模型,支持图像理解,有免费配额" + }, + "doubao-seed-1-6-250615": { + "name": "Doubao-Seed-1.6", + "provider": "doubao", + "supportsMultimodal": true, + "isReasoning": true, + "version": "latest", + 
"description": "支持auto/thinking/non-thinking三种思考模式、支持多模态、256K长上下文" } } } \ No newline at end of file diff --git a/config/proxy_api.json b/config/proxy_api.json index 0b26cb6..98e2832 100644 --- a/config/proxy_api.json +++ b/config/proxy_api.json @@ -4,7 +4,8 @@ "anthropic": "", "deepseek": "", "google": "", - "openai": "" + "openai": "", + "doubao": "" }, "enabled": true } \ No newline at end of file diff --git a/models/__init__.py b/models/__init__.py index 336f66e..43783ce 100644 --- a/models/__init__.py +++ b/models/__init__.py @@ -4,6 +4,7 @@ from .openai import OpenAIModel from .deepseek import DeepSeekModel from .alibaba import AlibabaModel from .google import GoogleModel +from .doubao import DoubaoModel from .factory import ModelFactory __all__ = [ @@ -13,5 +14,6 @@ __all__ = [ 'DeepSeekModel', 'AlibabaModel', 'GoogleModel', + 'DoubaoModel', 'ModelFactory' ] diff --git a/models/alibaba.py b/models/alibaba.py index e6b3076..4402b3c 100644 --- a/models/alibaba.py +++ b/models/alibaba.py @@ -4,12 +4,13 @@ from openai import OpenAI from .base import BaseModel class AlibabaModel(BaseModel): - def __init__(self, api_key: str, temperature: float = 0.7, system_prompt: str = None, language: str = None, model_name: str = None): + def __init__(self, api_key: str, temperature: float = 0.7, system_prompt: str = None, language: str = None, model_name: str = None, api_base_url: str = None): # 如果没有提供模型名称,才使用默认值 self.model_name = model_name if model_name else "QVQ-Max-2025-03-25" print(f"初始化阿里巴巴模型: {self.model_name}") # 在super().__init__之前设置model_name,这样get_default_system_prompt能使用它 super().__init__(api_key, temperature, system_prompt, language) + self.api_base_url = api_base_url # 存储API基础URL def get_default_system_prompt(self) -> str: """根据模型名称返回不同的默认系统提示词""" diff --git a/models/baidu_ocr.py b/models/baidu_ocr.py new file mode 100644 index 0000000..ac48057 --- /dev/null +++ b/models/baidu_ocr.py @@ -0,0 +1,177 @@ +import base64 +import json +import time +import 
urllib.request +import urllib.parse +from typing import Generator, Dict, Any +from .base import BaseModel + +class BaiduOCRModel(BaseModel): + """ + 百度OCR模型,用于图像文字识别 + """ + + def __init__(self, api_key: str, secret_key: str = None, temperature: float = 0.7, system_prompt: str = None): + """ + 初始化百度OCR模型 + + Args: + api_key: 百度API Key + secret_key: 百度Secret Key(可以在api_key中用冒号分隔传入) + temperature: 不用于OCR但保持BaseModel兼容性 + system_prompt: 不用于OCR但保持BaseModel兼容性 + + Raises: + ValueError: 如果API密钥格式无效 + """ + super().__init__(api_key, temperature, system_prompt) + + # 支持两种格式:单独传递或在api_key中用冒号分隔 + if secret_key: + self.api_key = api_key + self.secret_key = secret_key + else: + try: + self.api_key, self.secret_key = api_key.split(':') + except ValueError: + raise ValueError("百度OCR API密钥必须是 'API_KEY:SECRET_KEY' 格式或单独传递secret_key参数") + + # 百度API URLs + self.token_url = "https://aip.baidubce.com/oauth/2.0/token" + self.ocr_url = "https://aip.baidubce.com/rest/2.0/ocr/v1/accurate_basic" + + # 缓存access_token + self._access_token = None + self._token_expires = 0 + + def get_access_token(self) -> str: + """获取百度API的access_token""" + # 检查是否需要刷新token(提前5分钟刷新) + if self._access_token and time.time() < self._token_expires - 300: + return self._access_token + + # 请求新的access_token + params = { + 'grant_type': 'client_credentials', + 'client_id': self.api_key, + 'client_secret': self.secret_key + } + + data = urllib.parse.urlencode(params).encode('utf-8') + request = urllib.request.Request(self.token_url, data=data) + request.add_header('Content-Type', 'application/x-www-form-urlencoded') + + try: + with urllib.request.urlopen(request) as response: + result = json.loads(response.read().decode('utf-8')) + + if 'access_token' in result: + self._access_token = result['access_token'] + # 设置过期时间(默认30天,但我们提前刷新) + self._token_expires = time.time() + result.get('expires_in', 2592000) + return self._access_token + else: + raise Exception(f"获取access_token失败: {result.get('error_description', 
'未知错误')}") + + except Exception as e: + raise Exception(f"请求access_token失败: {str(e)}") + + def ocr_image(self, image_data: str) -> str: + """ + 对图像进行OCR识别 + + Args: + image_data: Base64编码的图像数据 + + Returns: + str: 识别出的文字内容 + """ + access_token = self.get_access_token() + + # 准备请求数据 + params = { + 'image': image_data, + 'language_type': 'auto_detect', # 自动检测语言 + 'detect_direction': 'true', # 检测图像朝向 + 'probability': 'false' # 不返回置信度(减少响应大小) + } + + data = urllib.parse.urlencode(params).encode('utf-8') + url = f"{self.ocr_url}?access_token={access_token}" + + request = urllib.request.Request(url, data=data) + request.add_header('Content-Type', 'application/x-www-form-urlencoded') + + try: + with urllib.request.urlopen(request) as response: + result = json.loads(response.read().decode('utf-8')) + + if 'error_code' in result: + raise Exception(f"百度OCR API错误: {result.get('error_msg', '未知错误')}") + + # 提取识别的文字 + words_result = result.get('words_result', []) + text_lines = [item['words'] for item in words_result] + + return '\n'.join(text_lines) + + except Exception as e: + raise Exception(f"OCR识别失败: {str(e)}") + + def extract_full_text(self, image_data: str) -> str: + """ + 提取图像中的完整文本(与Mathpix兼容的接口) + + Args: + image_data: Base64编码的图像数据 + + Returns: + str: 提取的文本内容 + """ + return self.ocr_image(image_data) + + def analyze_image(self, image_data: str, proxies: dict = None) -> Generator[Dict[str, Any], None, None]: + """ + 分析图像并返回OCR结果(流式输出以保持接口一致性) + + Args: + image_data: Base64编码的图像数据 + proxies: 代理配置(未使用) + + Yields: + dict: 包含OCR结果的响应 + """ + try: + text = self.ocr_image(image_data) + yield { + 'status': 'completed', + 'content': text, + 'model': 'baidu-ocr' + } + except Exception as e: + yield { + 'status': 'error', + 'content': f'OCR识别失败: {str(e)}', + 'model': 'baidu-ocr' + } + + def analyze_text(self, text: str, proxies: dict = None) -> Generator[Dict[str, Any], None, None]: + """ + 分析文本(OCR模型不支持文本分析) + + Args: + text: 输入文本 + proxies: 代理配置(未使用) + + Yields: + dict: 错误响应 + 
""" + yield { + 'status': 'error', + 'content': 'OCR模型不支持文本分析功能', + 'model': 'baidu-ocr' + } + + def get_model_identifier(self) -> str: + """返回模型标识符""" + return "baidu-ocr" diff --git a/models/deepseek.py b/models/deepseek.py index 4c314db..f324734 100644 --- a/models/deepseek.py +++ b/models/deepseek.py @@ -6,9 +6,10 @@ from openai import OpenAI from .base import BaseModel class DeepSeekModel(BaseModel): - def __init__(self, api_key: str, temperature: float = 0.7, system_prompt: str = None, language: str = None, model_name: str = "deepseek-reasoner"): + def __init__(self, api_key: str, temperature: float = 0.7, system_prompt: str = None, language: str = None, model_name: str = "deepseek-reasoner", api_base_url: str = None): super().__init__(api_key, temperature, system_prompt, language) self.model_name = model_name + self.api_base_url = api_base_url # 存储API基础URL def get_default_system_prompt(self) -> str: return """You are an expert at analyzing questions and providing detailed solutions. 
When presented with an image of a question: diff --git a/models/doubao.py b/models/doubao.py new file mode 100644 index 0000000..68d7925 --- /dev/null +++ b/models/doubao.py @@ -0,0 +1,339 @@ +import json +import os +import base64 +from typing import Generator, Dict, Any, Optional +import requests +from .base import BaseModel + +class DoubaoModel(BaseModel): + """ + 豆包API模型实现类 + 支持字节跳动的豆包AI模型,可处理文本和图像输入 + """ + + def __init__(self, api_key: str, temperature: float = 0.7, system_prompt: str = None, language: str = None, model_name: str = None, api_base_url: str = None): + """ + 初始化豆包模型 + + Args: + api_key: 豆包API密钥 + temperature: 生成温度 + system_prompt: 系统提示词 + language: 首选语言 + model_name: 指定具体模型名称,如不指定则使用默认值 + api_base_url: API基础URL,用于设置自定义API端点 + """ + super().__init__(api_key, temperature, system_prompt, language) + self.model_name = model_name or self.get_model_identifier() + self.base_url = api_base_url or "https://ark.cn-beijing.volces.com/api/v3" + self.max_tokens = 4096 # 默认最大输出token数 + self.reasoning_config = None # 推理配置,类似于AnthropicModel + + def get_default_system_prompt(self) -> str: + return """你是一个专业的问题分析专家。当看到问题图片时: +1. 仔细阅读并理解问题 +2. 分解问题的关键组成部分 +3. 提供清晰的分步解决方案 +4. 如果相关,解释涉及的概念或理论 +5. 
如果有多种方法,优先解释最有效的方法""" + + def get_model_identifier(self) -> str: + """返回默认的模型标识符""" + return "doubao-seed-1-6-250615" # Doubao-Seed-1.6 + + def get_actual_model_name(self) -> str: + """根据配置的模型名称返回实际的API调用标识符""" + # 豆包API的实际模型名称映射 + model_mapping = { + "doubao-seed-1-6-250615": "doubao-seed-1-6-250615" + } + + return model_mapping.get(self.model_name, "doubao-seed-1-6-250615") + + def analyze_text(self, text: str, proxies: dict = None) -> Generator[dict, None, None]: + """流式生成文本响应""" + try: + yield {"status": "started"} + + # 设置环境变量代理(如果提供) + original_proxies = None + if proxies: + original_proxies = { + 'http_proxy': os.environ.get('http_proxy'), + 'https_proxy': os.environ.get('https_proxy') + } + if 'http' in proxies: + os.environ['http_proxy'] = proxies['http'] + if 'https' in proxies: + os.environ['https_proxy'] = proxies['https'] + + try: + # 构建请求头 + headers = { + "Authorization": f"Bearer {self.api_key}", + "Content-Type": "application/json" + } + + # 构建消息 - 根据官方API文档,暂时不使用系统提示词 + messages = [] + + # 添加用户查询 + user_content = text + if self.language and self.language != 'auto': + user_content = f"请使用{self.language}回答以下问题: {text}" + + messages.append({ + "role": "user", + "content": user_content + }) + + # 处理推理配置 + thinking = { + "type": "auto" # 默认值 + } + + if hasattr(self, 'reasoning_config') and self.reasoning_config: + # 从reasoning_config中获取thinking_mode + thinking_mode = self.reasoning_config.get('thinking_mode', "auto") + thinking = { + "type": thinking_mode + } + + # 构建请求数据 + data = { + "model": self.get_actual_model_name(), + "messages": messages, + "thinking": thinking, + "temperature": self.temperature, + "max_tokens": self.max_tokens, + "stream": True + } + + # 发送流式请求 + response = requests.post( + f"{self.base_url}/chat/completions", + headers=headers, + json=data, + stream=True, + proxies=proxies if proxies else None, + timeout=60 + ) + + if response.status_code != 200: + error_text = response.text + raise Exception(f"HTTP {response.status_code}: 
{error_text}") + + response.raise_for_status() + + # 初始化响应缓冲区 + response_buffer = "" + + # 处理流式响应 + for line in response.iter_lines(): + if not line: + continue + + line = line.decode('utf-8') + if not line.startswith('data: '): + continue + + line = line[6:] # 移除 'data: ' 前缀 + + if line == '[DONE]': + break + + try: + chunk_data = json.loads(line) + choices = chunk_data.get('choices', []) + + if choices and len(choices) > 0: + delta = choices[0].get('delta', {}) + content = delta.get('content', '') + + if content: + response_buffer += content + + # 发送响应进度 + yield { + "status": "streaming", + "content": response_buffer + } + + except json.JSONDecodeError: + continue + + # 确保发送完整的最终内容 + yield { + "status": "completed", + "content": response_buffer + } + + finally: + # 恢复原始代理设置 + if original_proxies: + for key, value in original_proxies.items(): + if value is None: + if key in os.environ: + del os.environ[key] + else: + os.environ[key] = value + + except Exception as e: + yield { + "status": "error", + "error": f"豆包API错误: {str(e)}" + } + + def analyze_image(self, image_data: str, proxies: dict = None) -> Generator[dict, None, None]: + """分析图像并流式生成响应""" + try: + yield {"status": "started"} + + # 设置环境变量代理(如果提供) + original_proxies = None + if proxies: + original_proxies = { + 'http_proxy': os.environ.get('http_proxy'), + 'https_proxy': os.environ.get('https_proxy') + } + if 'http' in proxies: + os.environ['http_proxy'] = proxies['http'] + if 'https' in proxies: + os.environ['https_proxy'] = proxies['https'] + + try: + # 构建请求头 + headers = { + "Authorization": f"Bearer {self.api_key}", + "Content-Type": "application/json" + } + + # 处理图像数据 + if image_data.startswith('data:image'): + # 如果是data URI,提取base64部分 + image_data = image_data.split(',', 1)[1] + + # 构建用户消息 - 使用豆包API官方示例格式 + # 首先检查图像数据的格式,确保是有效的图像 + image_format = "jpeg" # 默认使用jpeg + if image_data.startswith('/9j/'): # JPEG magic number in base64 + image_format = "jpeg" + elif image_data.startswith('iVBORw0KGgo'): # 
PNG magic number in base64 + image_format = "png" + + user_content = [ + { + "type": "text", + "text": f"请使用{self.language}分析这张图片并提供详细解答。" if self.language and self.language != 'auto' else "请分析这张图片并提供详细解答?" + }, + { + "type": "image_url", + "image_url": { + "url": f"data:image/{image_format};base64,{image_data}" + } + } + ] + + messages = [ + { + "role": "user", + "content": user_content + } + ] + + # 处理推理配置 + thinking = { + "type": "auto" # 默认值 + } + + if hasattr(self, 'reasoning_config') and self.reasoning_config: + # 从reasoning_config中获取thinking_mode + thinking_mode = self.reasoning_config.get('thinking_mode', "auto") + thinking = { + "type": thinking_mode + } + + # 构建请求数据 + data = { + "model": self.get_actual_model_name(), + "messages": messages, + "thinking": thinking, + "temperature": self.temperature, + "max_tokens": self.max_tokens, + "stream": True + } + + # 发送流式请求 + response = requests.post( + f"{self.base_url}/chat/completions", + headers=headers, + json=data, + stream=True, + proxies=proxies if proxies else None, + timeout=60 + ) + + if response.status_code != 200: + error_text = response.text + raise Exception(f"HTTP {response.status_code}: {error_text}") + + response.raise_for_status() + + # 初始化响应缓冲区 + response_buffer = "" + + # 处理流式响应 + for line in response.iter_lines(): + if not line: + continue + + line = line.decode('utf-8') + if not line.startswith('data: '): + continue + + line = line[6:] # 移除 'data: ' 前缀 + + if line == '[DONE]': + break + + try: + chunk_data = json.loads(line) + choices = chunk_data.get('choices', []) + + if choices and len(choices) > 0: + delta = choices[0].get('delta', {}) + content = delta.get('content', '') + + if content: + response_buffer += content + + # 发送响应进度 + yield { + "status": "streaming", + "content": response_buffer + } + + except json.JSONDecodeError: + continue + + # 确保发送完整的最终内容 + yield { + "status": "completed", + "content": response_buffer + } + + finally: + # 恢复原始代理设置 + if original_proxies: + for key, value 
in original_proxies.items(): + if value is None: + if key in os.environ: + del os.environ[key] + else: + os.environ[key] = value + + except Exception as e: + yield { + "status": "error", + "error": f"豆包图像分析错误: {str(e)}" + } diff --git a/models/factory.py b/models/factory.py index ad5f6e2..96cbfbb 100644 --- a/models/factory.py +++ b/models/factory.py @@ -3,7 +3,8 @@ import json import os import importlib from .base import BaseModel -from .mathpix import MathpixModel # MathpixModel仍然需要直接导入,因为它是特殊工具 +from .mathpix import MathpixModel # MathpixModel需要直接导入,因为它是特殊OCR工具 +from .baidu_ocr import BaiduOCRModel # 百度OCR也是特殊OCR工具,直接导入 class ModelFactory: # 模型基本信息,包含类型和特性 @@ -39,13 +40,25 @@ class ModelFactory: 'description': model_info.get('description', '') } - # 添加Mathpix模型(特殊工具模型) + # 添加特殊OCR工具模型(不在配置文件中定义) + + # 添加Mathpix OCR工具 cls._models['mathpix'] = { 'class': MathpixModel, 'is_multimodal': True, 'is_reasoning': False, 'display_name': 'Mathpix OCR', - 'description': '文本提取工具,适用于数学公式和文本', + 'description': '数学公式识别工具,适用于复杂数学内容', + 'is_ocr_only': True + } + + # 添加百度OCR工具 + cls._models['baidu-ocr'] = { + 'class': BaiduOCRModel, + 'is_multimodal': True, + 'is_reasoning': False, + 'display_name': '百度OCR', + 'description': '通用文字识别工具,支持中文识别', 'is_ocr_only': True } @@ -62,22 +75,36 @@ class ModelFactory: # 不再硬编码模型定义,而是使用空字典 cls._models = {} - # 只保留Mathpix作为基础工具 + # 添加特殊OCR工具(当配置加载失败时的备用) try: - # 导入MathpixModel类 + # 导入并添加Mathpix OCR工具 from .mathpix import MathpixModel - # 添加Mathpix作为基础工具 cls._models['mathpix'] = { 'class': MathpixModel, 'is_multimodal': True, 'is_reasoning': False, 'display_name': 'Mathpix OCR', - 'description': '文本提取工具,适用于数学公式和文本', + 'description': '数学公式识别工具,适用于复杂数学内容', 'is_ocr_only': True } except Exception as e: - print(f"无法加载基础Mathpix工具: {str(e)}") + print(f"无法加载Mathpix OCR工具: {str(e)}") + + # 添加百度OCR工具 + try: + from .baidu_ocr import BaiduOCRModel + + cls._models['baidu-ocr'] = { + 'class': BaiduOCRModel, + 'is_multimodal': True, + 'is_reasoning': False, + 
'display_name': '百度OCR', + 'description': '通用文字识别工具,支持中文识别', + 'is_ocr_only': True + } + except Exception as e: + print(f"无法加载百度OCR工具: {str(e)}") @classmethod def create_model(cls, model_name: str, api_key: str, temperature: float = 0.7, @@ -114,6 +141,25 @@ class ModelFactory: ) # 对于阿里巴巴模型,也需要传递正确的模型名称 elif 'qwen' in model_name.lower() or 'qvq' in model_name.lower() or 'alibaba' in model_name.lower(): + return model_class( + api_key=api_key, + temperature=temperature, + system_prompt=system_prompt, + language=language, + model_name=model_name + ) + # 对于Google模型,也需要传递正确的模型名称 + elif 'gemini' in model_name.lower() or 'google' in model_name.lower(): + return model_class( + api_key=api_key, + temperature=temperature, + system_prompt=system_prompt, + language=language, + model_name=model_name, + api_base_url=api_base_url + ) + # 对于豆包模型,也需要传递正确的模型名称 + elif 'doubao' in model_name.lower(): return model_class( api_key=api_key, temperature=temperature, @@ -129,6 +175,13 @@ class ModelFactory: temperature=temperature, system_prompt=system_prompt ) + # 对于百度OCR模型,传递api_key(支持API_KEY:SECRET_KEY格式) + elif model_name == 'baidu-ocr': + return model_class( + api_key=api_key, + temperature=temperature, + system_prompt=system_prompt + ) # 对于Anthropic模型,需要传递model_identifier参数 elif 'claude' in model_name.lower() or 'anthropic' in model_name.lower(): return model_class( diff --git a/models/google.py b/models/google.py index 6904bfe..fe9f210 100644 --- a/models/google.py +++ b/models/google.py @@ -30,10 +30,17 @@ class GoogleModel(BaseModel): # 配置Google API if api_base_url: - # 如果提供了自定义API基础URL,设置genai的api_url - genai.configure(api_key=api_key, transport="rest", client_options={"api_endpoint": api_base_url}) + # 配置中转API - 使用环境变量方式 + # 移除末尾的斜杠以避免重复路径问题 + clean_base_url = api_base_url.rstrip('/') + # 设置环境变量来指定API端点 + os.environ['GOOGLE_AI_API_ENDPOINT'] = clean_base_url + genai.configure(api_key=api_key) else: # 使用默认API端点 + # 清除可能存在的自定义端点环境变量 + if 'GOOGLE_AI_API_ENDPOINT' in os.environ: + 
del os.environ['GOOGLE_AI_API_ENDPOINT'] genai.configure(api_key=api_key) def get_default_system_prompt(self) -> str: @@ -46,7 +53,7 @@ class GoogleModel(BaseModel): def get_model_identifier(self) -> str: """返回默认的模型标识符""" - return "gemini-2.5-pro-preview-03-25" + return "gemini-2.5-flash" # 使用有免费配额的模型作为默认值 def analyze_text(self, text: str, proxies: dict = None) -> Generator[dict, None, None]: """流式生成文本响应""" diff --git a/static/js/main.js b/static/js/main.js index ed6b8b9..13b2686 100644 --- a/static/js/main.js +++ b/static/js/main.js @@ -1053,10 +1053,33 @@ class SnapSolver { this.extractTextBtn.innerHTML = '提取中...'; const settings = window.settingsManager.getSettings(); + + // 根据用户设置的OCR源进行选择 + const ocrSource = settings.ocrSource || 'auto'; + const baiduApiKey = window.settingsManager.apiKeyValues.BaiduApiKey; + const baiduSecretKey = window.settingsManager.apiKeyValues.BaiduSecretKey; const mathpixApiKey = settings.mathpixApiKey; - if (!mathpixApiKey || mathpixApiKey === ':') { - window.uiManager.showToast('请在设置中输入Mathpix API凭据', 'error'); + const hasBaiduOCR = baiduApiKey && baiduSecretKey; + const hasMathpix = mathpixApiKey && mathpixApiKey !== ':'; + + // 根据OCR源配置检查可用性 + let canProceed = false; + let missingOCRMessage = ''; + + if (ocrSource === 'baidu') { + canProceed = hasBaiduOCR; + missingOCRMessage = '请在设置中配置百度OCR API密钥'; + } else if (ocrSource === 'mathpix') { + canProceed = hasMathpix; + missingOCRMessage = '请在设置中配置Mathpix API密钥'; + } else { // auto + canProceed = hasBaiduOCR || hasMathpix; + missingOCRMessage = '请在设置中配置OCR API密钥:百度OCR(推荐)或Mathpix'; + } + + if (!canProceed) { + window.uiManager.showToast(missingOCRMessage, 'error'); document.getElementById('settingsPanel').classList.add('active'); this.extractTextBtn.disabled = false; this.extractTextBtn.innerHTML = '提取文本'; @@ -1076,7 +1099,7 @@ class SnapSolver { this.socket.emit('extract_text', { image: this.croppedImage.split(',')[1], settings: { - mathpixApiKey: mathpixApiKey + ocrSource: 
settings.ocrSource || 'auto' } }); diff --git a/static/js/settings.js b/static/js/settings.js index 87c1e2b..b0019c5 100644 --- a/static/js/settings.js +++ b/static/js/settings.js @@ -374,6 +374,9 @@ class SettingsManager { // 模型选择器对象 this.modelSelector = null; + // OCR源配置 + this.ocrSource = 'auto'; // 默认自动选择 + // 存储API密钥的对象 this.apiKeyValues = { 'AnthropicApiKey': '', @@ -381,6 +384,9 @@ class SettingsManager { 'DeepseekApiKey': '', 'AlibabaApiKey': '', 'GoogleApiKey': '', + 'DoubaoApiKey': '', + 'BaiduApiKey': '', + 'BaiduSecretKey': '', 'MathpixAppId': '', 'MathpixAppKey': '' }; @@ -391,7 +397,8 @@ class SettingsManager { 'OpenaiApiBaseUrl': '', 'DeepseekApiBaseUrl': '', 'AlibabaApiBaseUrl': '', - 'GoogleApiBaseUrl': '' + 'GoogleApiBaseUrl': '', + 'DoubaoApiBaseUrl': '' }; // 加载模型配置 @@ -580,6 +587,13 @@ class SettingsManager { this.updateReasoningOptionUI(settings.reasoningDepth); } + // 加载豆包思考模式设置 + if (settings.doubaoThinkingMode && this.doubaoThinkingModeSelect) { + this.doubaoThinkingModeSelect.value = settings.doubaoThinkingMode; + // 更新豆包思考选项UI + this.updateDoubaoThinkingOptionUI(settings.doubaoThinkingMode); + } + // 加载思考预算百分比 const thinkBudgetPercent = parseInt(settings.thinkBudgetPercent || '50'); if (this.thinkBudgetPercentInput) { @@ -624,6 +638,14 @@ class SettingsManager { this.proxyPortInput.value = settings.proxyPort; } + // Load OCR source setting + if (settings.ocrSource) { + this.ocrSource = settings.ocrSource; + if (this.ocrSourceSelect) { + this.ocrSourceSelect.value = settings.ocrSource; + } + } + // Update UI based on model type this.updateUIBasedOnModelType(); @@ -720,6 +742,14 @@ class SettingsManager { this.thinkBudgetGroup.style.display = showThinkBudget ? 'block' : 'none'; } + // 处理豆包深度思考设置显示 + const isDoubaoReasoning = modelInfo.isReasoning && modelInfo.provider === 'doubao'; + + // 只有对豆包推理模型才显示深度思考设置 + if (this.doubaoThinkingGroup) { + this.doubaoThinkingGroup.style.display = isDoubaoReasoning ? 
'block' : 'none'; + } + // 控制最大Token设置的显示 // 阿里巴巴模型不支持自定义Token设置 const maxTokensGroup = this.maxTokens ? this.maxTokens.closest('.setting-group') : null; @@ -759,6 +789,8 @@ class SettingsManager { apiKeyToHighlight = document.querySelector('.api-key-status:nth-child(4)'); // Alibaba } else if (modelType && (modelType.toLowerCase().includes('gemini') || modelType.toLowerCase().includes('google'))) { apiKeyToHighlight = document.querySelector('.api-key-status:nth-child(5)'); // Google + } else if (modelType && modelType.toLowerCase().includes('doubao')) { + apiKeyToHighlight = document.querySelector('.api-key-status:nth-child(6)'); // 豆包 } if (apiKeyToHighlight) { @@ -775,6 +807,7 @@ class SettingsManager { model: this.modelSelect.value, maxTokens: this.maxTokens.value, reasoningDepth: this.reasoningDepthSelect?.value || 'standard', + doubaoThinkingMode: this.doubaoThinkingModeSelect?.value || 'auto', thinkBudgetPercent: this.thinkBudgetPercentInput?.value || '50', temperature: this.temperatureInput.value, language: this.languageInput.value, @@ -782,7 +815,8 @@ class SettingsManager { currentPromptId: this.currentPromptId, proxyEnabled: this.proxyEnabledInput.checked, proxyHost: this.proxyHostInput.value, - proxyPort: this.proxyPortInput.value + proxyPort: this.proxyPortInput.value, + ocrSource: this.ocrSource // 添加OCR源配置保存 }; // 保存设置到localStorage @@ -832,17 +866,30 @@ class SettingsManager { const reasoningDepth = this.reasoningDepthSelect?.value || 'standard'; const thinkBudgetPercent = parseInt(this.thinkBudgetPercentInput?.value || '50'); + // 获取豆包思考模式设置 + const doubaoThinkingMode = this.doubaoThinkingModeSelect?.value || 'auto'; + // 计算思考预算的实际Token数 const thinkBudget = Math.floor(maxTokens * (thinkBudgetPercent / 100)); // 构建推理配置参数 const reasoningConfig = {}; - if (modelInfo.provider === 'anthropic' && modelInfo.isReasoning) { - if (reasoningDepth === 'extended') { - reasoningConfig.reasoning_depth = 'extended'; - reasoningConfig.think_budget = thinkBudget; - } 
else { - reasoningConfig.speed_mode = 'instant'; + + // 处理不同模型的推理配置 + if (modelInfo.isReasoning) { + // 对于Anthropic模型 + if (modelInfo.provider === 'anthropic') { + if (reasoningDepth === 'extended') { + reasoningConfig.reasoning_depth = 'extended'; + reasoningConfig.think_budget = thinkBudget; + } else { + reasoningConfig.speed_mode = 'instant'; + } + } + + // 对于豆包模型 + if (modelInfo.provider === 'doubao') { + reasoningConfig.thinking_mode = doubaoThinkingMode; } } @@ -869,6 +916,9 @@ class SettingsManager { if (this.apiBaseUrlValues['GoogleApiBaseUrl']) { apiBaseUrls.google = this.apiBaseUrlValues['GoogleApiBaseUrl']; } + if (this.apiBaseUrlValues['DoubaoApiBaseUrl']) { + apiBaseUrls.doubao = this.apiBaseUrlValues['DoubaoApiBaseUrl']; + } } return { @@ -881,6 +931,8 @@ class SettingsManager { proxyHost: this.proxyHostInput.value, proxyPort: this.proxyPortInput.value, mathpixApiKey: mathpixApiKey, + ocrSource: this.ocrSource, // 添加OCR源配置 + doubaoThinkingMode: doubaoThinkingMode, // 添加豆包思考模式配置 modelInfo: { supportsMultimodal: modelInfo.supportsMultimodal || false, isReasoning: modelInfo.isReasoning || false, @@ -1121,6 +1173,20 @@ class SettingsManager { this.saveSettings(); }); + // OCR源选择器事件监听 + if (this.ocrSourceSelect) { + this.ocrSourceSelect.addEventListener('change', (e) => { + // 阻止事件冒泡 + e.stopPropagation(); + + // 更新OCR源配置 + this.ocrSource = e.target.value; + this.saveSettings(); + + console.log('OCR源已切换为:', this.ocrSource); + }); + } + // Panel visibility if (this.settingsToggle) { this.settingsToggle.addEventListener('click', () => { @@ -1195,6 +1261,71 @@ class SettingsManager { // 初始化API密钥编辑功能 this.initApiKeyEditFunctions(); + + // 初始化推理选项事件 + this.initReasoningOptionEvents(); + + // 初始化豆包思考选项事件 + this.initDoubaoThinkingOptionEvents(); + } + + // 初始化推理选项事件 + initReasoningOptionEvents() { + const reasoningOptions = document.querySelectorAll('.reasoning-option'); + reasoningOptions.forEach(option => { + option.addEventListener('click', (e) => { + 
e.preventDefault(); + e.stopPropagation(); + + const value = option.getAttribute('data-value'); + if (value && this.reasoningDepthSelect) { + // 更新select值 + this.reasoningDepthSelect.value = value; + + // 更新UI + this.updateReasoningOptionUI(value); + + // 保存设置 + this.saveSettings(); + } + }); + }); + } + + // 初始化豆包思考选项事件 + initDoubaoThinkingOptionEvents() { + const doubaoThinkingOptions = document.querySelectorAll('.doubao-thinking-option'); + doubaoThinkingOptions.forEach(option => { + option.addEventListener('click', (e) => { + e.preventDefault(); + e.stopPropagation(); + + const value = option.getAttribute('data-value'); + if (value && this.doubaoThinkingModeSelect) { + // 更新select值 + this.doubaoThinkingModeSelect.value = value; + + // 更新UI + this.updateDoubaoThinkingOptionUI(value); + + // 保存设置 + this.saveSettings(); + } + }); + }); + } + + // 更新豆包思考选项UI + updateDoubaoThinkingOptionUI(value) { + const doubaoThinkingOptions = document.querySelectorAll('.doubao-thinking-option'); + doubaoThinkingOptions.forEach(option => { + const optionValue = option.getAttribute('data-value'); + if (optionValue === value) { + option.classList.add('active'); + } else { + option.classList.remove('active'); + } + }); } // 更新思考预算显示 @@ -2208,10 +2339,17 @@ class SettingsManager { this.thinkBudgetPercentValue = document.getElementById('thinkBudgetPercentValue'); this.thinkBudgetGroup = document.querySelector('.think-budget-group'); + // 豆包深度思考相关元素 + this.doubaoThinkingModeSelect = document.getElementById('doubaoThinkingMode'); + this.doubaoThinkingGroup = document.querySelector('.doubao-thinking-group'); + // Initialize Mathpix inputs this.mathpixAppIdInput = document.getElementById('mathpixAppId'); this.mathpixAppKeyInput = document.getElementById('mathpixAppKey'); + // OCR源选择器 + this.ocrSourceSelect = document.getElementById('ocrSourceSelect'); + // API Key elements - 所有的密钥输入框 this.apiKeyInputs = { 'AnthropicApiKey': document.getElementById('AnthropicApiKey'), @@ -2260,6 +2398,9 @@ 
class SettingsManager { 'DeepseekApiKey': '', 'AlibabaApiKey': '', 'GoogleApiKey': '', + 'DoubaoApiKey': '', + 'BaiduApiKey': '', + 'BaiduSecretKey': '', 'MathpixAppId': '', 'MathpixAppKey': '' }; @@ -2359,7 +2500,8 @@ class SettingsManager { 'OpenaiApiBaseUrl': proxyApiConfig.apis?.openai || '', 'DeepseekApiBaseUrl': proxyApiConfig.apis?.deepseek || '', 'AlibabaApiBaseUrl': proxyApiConfig.apis?.alibaba || '', - 'GoogleApiBaseUrl': proxyApiConfig.apis?.google || '' + 'GoogleApiBaseUrl': proxyApiConfig.apis?.google || '', + 'DoubaoApiBaseUrl': proxyApiConfig.apis?.doubao || '' }; this.updateApiBaseUrlStatus(apiBaseUrls); console.log('API基础URL状态已刷新'); @@ -2449,6 +2591,9 @@ class SettingsManager { case 'GoogleApiBaseUrl': config.apis.google = value; break; + case 'DoubaoApiBaseUrl': + config.apis.doubao = value; + break; } // 确保启用中转API diff --git a/static/style.css b/static/style.css index 7ba0233..2df0e26 100644 --- a/static/style.css +++ b/static/style.css @@ -2174,6 +2174,82 @@ button:disabled { transition: all 0.2s ease-in-out; } +/* OCR设置样式 */ +.ocr-settings { + margin-bottom: 1.5rem; +} + +.ocr-source-control { + display: flex; + flex-direction: column; + gap: 12px; +} + +.ocr-source-selector { + position: relative; +} + +.ocr-source-select { + width: 100%; + padding: 10px 14px; + border: 1px solid var(--border-color); + border-radius: 8px; + background: var(--surface); + color: var(--text-primary); + font-size: 0.9rem; + transition: all 0.2s ease; +} + +.ocr-source-select:hover { + border-color: var(--primary-color); + box-shadow: 0 2px 8px rgba(0, 0, 0, 0.1); +} + +.ocr-source-select:focus { + outline: none; + border-color: var(--primary-color); + box-shadow: 0 0 0 3px rgba(var(--primary-rgb), 0.1); +} + +.ocr-source-description { + display: flex; + flex-direction: column; + gap: 8px; + padding: 12px; + background: rgba(0, 0, 0, 0.02); + border-radius: 8px; + border: 1px solid var(--border-color); +} + +.ocr-desc-item { + display: flex; + align-items: 
flex-start; + gap: 8px; + font-size: 0.85rem; + line-height: 1.4; + color: var(--text-secondary); +} + +.ocr-desc-item i { + color: var(--primary-color); + margin-top: 2px; + flex-shrink: 0; +} + +.ocr-desc-item strong { + color: var(--text-primary); +} + +/* 暗色主题下的OCR设置样式 */ +[data-theme="dark"] .ocr-source-description { + background: rgba(255, 255, 255, 0.02); +} + +[data-theme="dark"] .ocr-source-select { + background: var(--surface); + border-color: var(--border-color); +} + /* 新增的推理控制组件样式 */ .reasoning-control { display: flex; @@ -2260,6 +2336,122 @@ button:disabled { opacity: 1; } +/* 豆包深度思考控制组件样式 */ +.doubao-thinking-control { + display: flex; + flex-direction: column; + gap: 8px; +} + +.doubao-thinking-label { + display: flex; + justify-content: space-between; + align-items: center; + margin-bottom: 6px; +} + +.doubao-thinking-selector { + display: flex; + gap: 8px; + margin-bottom: 8px; +} + +.doubao-thinking-option { + flex: 1; + display: flex; + flex-direction: column; + align-items: center; + padding: 12px 8px; + border-radius: 8px; + background: rgba(0, 0, 0, 0.05); + cursor: pointer; + transition: all 0.2s ease; + border: 2px solid transparent; + position: relative; + overflow: hidden; + min-height: 80px; + justify-content: center; +} + +.doubao-thinking-option::before { + content: ''; + position: absolute; + bottom: 0; + left: 0; + width: 100%; + height: 3px; + background: linear-gradient(to right, var(--primary-color), transparent); + opacity: 0; + transition: opacity 0.3s ease; +} + +.doubao-thinking-option:hover { + transform: translateY(-2px); + box-shadow: 0 2px 8px rgba(0, 0, 0, 0.1); +} + +.doubao-thinking-option.active { + background: rgba(var(--primary-rgb), 0.1); + border-color: var(--primary-color); +} + +.doubao-thinking-option.active::before { + opacity: 1; +} + +.doubao-thinking-option i { + font-size: 1.3rem; + margin-bottom: 6px; + color: var(--primary-color); + opacity: 0.8; + transition: all 0.2s ease; +} + +.doubao-thinking-option 
.option-name { + font-weight: 600; + font-size: 0.85rem; + margin-bottom: 4px; + text-align: center; +} + +.doubao-thinking-option .option-desc { + font-size: 0.7rem; + opacity: 0.7; + text-align: center; + line-height: 1.2; +} + +.doubao-thinking-option:hover i { + transform: scale(1.1); + opacity: 1; +} + +.doubao-thinking-desc { + display: flex; + flex-direction: column; + gap: 6px; + margin-top: 8px; + padding: 8px; + background: rgba(0, 0, 0, 0.03); + border-radius: 6px; +} + +.doubao-desc-item { + display: flex; + align-items: center; + gap: 8px; + font-size: 0.8rem; + opacity: 0.8; +} + +.doubao-desc-item i { + font-size: 0.9rem; + color: var(--primary-color); + opacity: 0.7; + width: 16px; + text-align: center; +} + /* 思考预算控制组件样式 */ .think-budget-control { display: flex; @@ -2411,6 +2603,18 @@ button:disabled { background: rgba(var(--primary-rgb), 0.2); } +[data-theme="dark"] .doubao-thinking-option { + background: rgba(255, 255, 255, 0.05); +} + +[data-theme="dark"] .doubao-thinking-option.active { + background: rgba(var(--primary-rgb), 0.2); +} + +[data-theme="dark"] .doubao-thinking-desc { + background: rgba(255, 255, 255, 0.03); +} + [data-theme="dark"] .think-value-badge { background: rgba(255, 255, 255, 0.1); } diff --git a/templates/index.html b/templates/index.html index 5432c2e..0a2f075 100644 --- a/templates/index.html +++ b/templates/index.html @@ -219,6 +219,49 @@ +
    @@ -252,6 +295,7 @@
    +
    @@ -309,6 +353,37 @@
    + +
    +

    OCR设置

    +
    +
    + +
    + +
    +
    +
    + + 自动选择:优先使用百度OCR,如无配置则使用Mathpix +
    +
    + + 百度OCR:支持中文,免费额度大,推荐使用 +
    +
    + + Mathpix:专业数学公式识别,支持LaTeX格式 +
    +
    +
    +
    +
    +

    API密钥设置

    @@ -425,6 +500,75 @@
    +
    + Doubao API: +
    + +
    + 未设置 + +
    + + +
    +
    + + +
    + 百度OCR API Key: +
    + +
    + 未设置 + +
    + + +
    +
    +
    + 百度OCR Secret Key: +
    + +
    + 未设置 + +
    + + +
    +
    +
    Mathpix App ID:
    @@ -577,6 +721,25 @@
    +
    + Doubao API URL: +
    + +
    + 未设置 + +
    + + +
    +