From 6afe56c816a76d9150e967c06db562196e797391 Mon Sep 17 00:00:00 2001 From: skestar Date: Sun, 3 Aug 2025 00:19:09 +0800 Subject: [PATCH] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E7=99=BE=E5=BA=A6OCR?= =?UTF-8?q?=E6=94=AF=E6=8C=81=EF=BC=8C=E6=9B=B4=E6=96=B0OCR=E6=BA=90?= =?UTF-8?q?=E9=80=89=E6=8B=A9=E5=92=8C=E8=AE=BE=E7=BD=AE=E7=95=8C=E9=9D=A2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- app.py | 65 +++++++++++----- models/baidu_ocr.py | 177 ++++++++++++++++++++++++++++++++++++++++++ models/factory.py | 50 ++++++++++-- static/js/main.js | 29 ++++++- static/js/settings.js | 36 ++++++++- static/style.css | 76 ++++++++++++++++++ templates/index.html | 78 +++++++++++++++++++ 7 files changed, 480 insertions(+), 31 deletions(-) create mode 100644 models/baidu_ocr.py diff --git a/app.py b/app.py index fcdbdc2..0160584 100644 --- a/app.py +++ b/app.py @@ -324,39 +324,66 @@ def handle_text_extraction(data): if not isinstance(settings, dict): raise ValueError("Invalid settings format") - # 尝试从本地配置获取Mathpix API密钥 - mathpix_app_id = get_api_key('MathpixAppId') - mathpix_app_key = get_api_key('MathpixAppKey') + # 优先使用百度OCR,如果没有配置则使用Mathpix + # 首先尝试获取百度OCR API密钥 + baidu_api_key = get_api_key('BaiduApiKey') + baidu_secret_key = get_api_key('BaiduSecretKey') - # 构建完整的Mathpix API密钥(格式:app_id:app_key) - mathpix_key = f"{mathpix_app_id}:{mathpix_app_key}" if mathpix_app_id and mathpix_app_key else None + # 构建百度OCR API密钥(格式:api_key:secret_key) + ocr_key = None + ocr_model = None - # 如果本地没有配置,尝试使用前端传递的密钥(向后兼容) - if not mathpix_key: - mathpix_key = settings.get('mathpixApiKey') + if baidu_api_key and baidu_secret_key: + ocr_key = f"{baidu_api_key}:{baidu_secret_key}" + ocr_model = 'baidu-ocr' + print("Using Baidu OCR for text extraction...") + else: + # 回退到Mathpix + mathpix_app_id = get_api_key('MathpixAppId') + mathpix_app_key = get_api_key('MathpixAppKey') + + # 构建完整的Mathpix API密钥(格式:app_id:app_key) + mathpix_key = f"{mathpix_app_id}:{mathpix_app_key}" if mathpix_app_id and mathpix_app_key else None + + # 如果本地没有配置,尝试使用前端传递的密钥(向后兼容) + if not mathpix_key: + mathpix_key = settings.get('mathpixApiKey') + + if mathpix_key: + ocr_key = mathpix_key + ocr_model = 'mathpix' + print("Using Mathpix OCR for text extraction...") - if not mathpix_key: - raise ValueError("Mathpix API key is required") + if not ocr_key: + raise ValueError("OCR API key is required. Please configure Baidu OCR (API Key + Secret Key) or Mathpix (App ID + App Key)") # 先回复客户端,确认已收到请求,防止超时断开 # 注意:这里不能使用return,否则后续代码不会执行 socketio.emit('request_acknowledged', { 'status': 'received', - 'message': 'Image received, text extraction in progress' + 'message': f'Image received, text extraction in progress using {ocr_model}' }, room=request.sid) try: - app_id, app_key = mathpix_key.split(':') - if not app_id.strip() or not app_key.strip(): - raise ValueError() + if ocr_model == 'baidu-ocr': + api_key, secret_key = ocr_key.split(':') + if not api_key.strip() or not secret_key.strip(): + raise ValueError() + elif ocr_model == 'mathpix': + app_id, app_key = ocr_key.split(':') + if not app_id.strip() or not app_key.strip(): + raise ValueError() except ValueError: - raise ValueError("Invalid Mathpix API key format. Expected format: 'app_id:app_key'") + if ocr_model == 'baidu-ocr': + raise ValueError("Invalid Baidu OCR API key format. Expected format: 'API_KEY:SECRET_KEY'") + else: + raise ValueError("Invalid Mathpix API key format. Expected format: 'app_id:app_key'") - print("Creating Mathpix model instance...") - # 只传递必需的参数,ModelFactory.create_model会处理不同模型类型 + print(f"Creating {ocr_model} model instance...") + # ModelFactory.create_model会处理不同模型类型 model = ModelFactory.create_model( - model_name='mathpix', - api_key=mathpix_key + model_name=ocr_model, + api_key=ocr_key ) print("Starting text extraction...") diff --git a/models/baidu_ocr.py b/models/baidu_ocr.py new file mode 100644 index 0000000..ac48057 --- /dev/null +++ b/models/baidu_ocr.py @@ -0,0 +1,177 @@ +import base64 +import json +import time +import urllib.request +import urllib.parse +from typing import Generator, Dict, Any +from .base import BaseModel + +class BaiduOCRModel(BaseModel): + """ + 百度OCR模型,用于图像文字识别 + """ + + def __init__(self, api_key: str, secret_key: str = None, temperature: float = 0.7, system_prompt: str = None): + """ + 初始化百度OCR模型 + + Args: + api_key: 百度API Key + secret_key: 百度Secret Key(可以在api_key中用冒号分隔传入) + temperature: 不用于OCR但保持BaseModel兼容性 + system_prompt: 不用于OCR但保持BaseModel兼容性 + + Raises: + ValueError: 如果API密钥格式无效 + """ + super().__init__(api_key, temperature, system_prompt) + + # 支持两种格式:单独传递或在api_key中用冒号分隔 + if secret_key: + self.api_key = api_key + self.secret_key = secret_key + else: + try: + self.api_key, self.secret_key = api_key.split(':') + except ValueError: + raise ValueError("百度OCR API密钥必须是 'API_KEY:SECRET_KEY' 格式或单独传递secret_key参数") + + # 百度API URLs + self.token_url = "https://aip.baidubce.com/oauth/2.0/token" + self.ocr_url = "https://aip.baidubce.com/rest/2.0/ocr/v1/accurate_basic" + + # 缓存access_token + self._access_token = None + self._token_expires = 0 + + def get_access_token(self) -> str: + """获取百度API的access_token""" + # 检查是否需要刷新token(提前5分钟刷新) + if self._access_token and time.time() < self._token_expires - 300: + return self._access_token + + # 请求新的access_token + params = { + 'grant_type': 'client_credentials', + 'client_id': self.api_key, + 'client_secret': self.secret_key + } + + data = urllib.parse.urlencode(params).encode('utf-8') + request = urllib.request.Request(self.token_url, data=data) + request.add_header('Content-Type', 'application/x-www-form-urlencoded') + + try: + with urllib.request.urlopen(request) as response: + result = json.loads(response.read().decode('utf-8')) + + if 'access_token' in result: + self._access_token = result['access_token'] + # 设置过期时间(默认30天,但我们提前刷新) + self._token_expires = time.time() + result.get('expires_in', 2592000) + return self._access_token + else: + raise Exception(f"获取access_token失败: {result.get('error_description', '未知错误')}") + + except Exception as e: + raise Exception(f"请求access_token失败: {str(e)}") + + def ocr_image(self, image_data: str) -> str: + """ + 对图像进行OCR识别 + + Args: + image_data: Base64编码的图像数据 + + Returns: + str: 识别出的文字内容 + """ + access_token = self.get_access_token() + + # 准备请求数据 + params = { + 'image': image_data, + 'language_type': 'auto_detect', # 自动检测语言 + 'detect_direction': 'true', # 检测图像朝向 + 'probability': 'false' # 不返回置信度(减少响应大小) + } + + data = urllib.parse.urlencode(params).encode('utf-8') + url = f"{self.ocr_url}?access_token={access_token}" + + request = urllib.request.Request(url, data=data) + request.add_header('Content-Type', 'application/x-www-form-urlencoded') + + try: + with urllib.request.urlopen(request) as response: + result = json.loads(response.read().decode('utf-8')) + + if 'error_code' in result: + raise Exception(f"百度OCR API错误: {result.get('error_msg', '未知错误')}") + + # 提取识别的文字 + words_result = result.get('words_result', []) + text_lines = [item['words'] for item in words_result] + + return '\n'.join(text_lines) + + except Exception as e: + raise Exception(f"OCR识别失败: {str(e)}") + + def extract_full_text(self, image_data: str) -> str: + """ + 提取图像中的完整文本(与Mathpix兼容的接口) + + Args: + image_data: Base64编码的图像数据 + + Returns: + str: 提取的文本内容 + """ + return self.ocr_image(image_data) + + def analyze_image(self, image_data: str, proxies: dict = None) -> Generator[Dict[str, Any], None, None]: + """ + 分析图像并返回OCR结果(流式输出以保持接口一致性) + + Args: + image_data: Base64编码的图像数据 + proxies: 代理配置(未使用) + + Yields: + dict: 包含OCR结果的响应 + """ + try: + text = self.ocr_image(image_data) + yield { + 'status': 'completed', + 'content': text, + 'model': 'baidu-ocr' + } + except Exception as e: + yield { + 'status': 'error', + 'content': f'OCR识别失败: {str(e)}', + 'model': 'baidu-ocr' + } + + def analyze_text(self, text: str, proxies: dict = None) -> Generator[Dict[str, Any], None, None]: + """ + 分析文本(OCR模型不支持文本分析) + + Args: + text: 输入文本 + proxies: 代理配置(未使用) + + Yields: + dict: 错误响应 + """ + yield { + 'status': 'error', + 'content': 'OCR模型不支持文本分析功能', + 'model': 'baidu-ocr' + } + + def get_model_identifier(self) -> str: + """返回模型标识符""" + return "baidu-ocr" diff --git a/models/factory.py b/models/factory.py index 7e07e92..96cbfbb 100644 --- a/models/factory.py +++ b/models/factory.py @@ -3,7 +3,8 @@ import json import os import importlib from .base import BaseModel -from .mathpix import MathpixModel # MathpixModel仍然需要直接导入,因为它是特殊工具 +from .mathpix import MathpixModel # MathpixModel需要直接导入,因为它是特殊OCR工具 +from .baidu_ocr import BaiduOCRModel # 百度OCR也是特殊OCR工具,直接导入 class ModelFactory: # 模型基本信息,包含类型和特性 @@ -39,13 +40,25 @@ class ModelFactory: 'description': model_info.get('description', '') } - # 添加Mathpix模型(特殊工具模型) + # 添加特殊OCR工具模型(不在配置文件中定义) + + # 添加Mathpix OCR工具 cls._models['mathpix'] = { 'class': MathpixModel, 'is_multimodal': True, 'is_reasoning': False, 'display_name': 'Mathpix OCR', - 'description': '文本提取工具,适用于数学公式和文本', + 'description': '数学公式识别工具,适用于复杂数学内容', + 'is_ocr_only': True + } + + # 添加百度OCR工具 + cls._models['baidu-ocr'] = { + 'class': BaiduOCRModel, + 'is_multimodal': True, + 'is_reasoning': False, + 'display_name': '百度OCR', + 'description': '通用文字识别工具,支持中文识别', 'is_ocr_only': True } @@ -62,22 +75,36 @@ class ModelFactory: # 不再硬编码模型定义,而是使用空字典 cls._models = {} - # 只保留Mathpix作为基础工具 + # 添加特殊OCR工具(当配置加载失败时的备用) try: - # 导入MathpixModel类 + # 导入并添加Mathpix OCR工具 from .mathpix import MathpixModel - # 添加Mathpix作为基础工具 cls._models['mathpix'] = { 'class': MathpixModel, 'is_multimodal': True, 'is_reasoning': False, 'display_name': 'Mathpix OCR', - 'description': '文本提取工具,适用于数学公式和文本', + 'description': '数学公式识别工具,适用于复杂数学内容', 'is_ocr_only': True } except Exception as e: - print(f"无法加载基础Mathpix工具: {str(e)}") + print(f"无法加载Mathpix OCR工具: {str(e)}") + + # 添加百度OCR工具 + try: + from .baidu_ocr import BaiduOCRModel + + cls._models['baidu-ocr'] = { + 'class': BaiduOCRModel, + 'is_multimodal': True, + 'is_reasoning': False, + 'display_name': '百度OCR', + 'description': '通用文字识别工具,支持中文识别', + 'is_ocr_only': True + } + except Exception as e: + print(f"无法加载百度OCR工具: {str(e)}") @classmethod def create_model(cls, model_name: str, api_key: str, temperature: float = 0.7, @@ -148,6 +175,13 @@ class ModelFactory: temperature=temperature, system_prompt=system_prompt ) + # 对于百度OCR模型,传递api_key(支持API_KEY:SECRET_KEY格式) + elif model_name == 'baidu-ocr': + return model_class( + api_key=api_key, + temperature=temperature, + system_prompt=system_prompt + ) # 对于Anthropic模型,需要传递model_identifier参数 elif 'claude' in model_name.lower() or 'anthropic' in model_name.lower(): return model_class( diff --git a/static/js/main.js b/static/js/main.js index ed6b8b9..13b2686 100644 --- a/static/js/main.js +++ b/static/js/main.js @@ -1053,10 +1053,33 @@ class SnapSolver { this.extractTextBtn.innerHTML = '提取中...'; const settings = window.settingsManager.getSettings(); + + // 根据用户设置的OCR源进行选择 + const ocrSource = settings.ocrSource || 'auto'; + const baiduApiKey = window.settingsManager.apiKeyValues.BaiduApiKey; + const baiduSecretKey = window.settingsManager.apiKeyValues.BaiduSecretKey; const mathpixApiKey = settings.mathpixApiKey; - if (!mathpixApiKey || mathpixApiKey === ':') { - window.uiManager.showToast('请在设置中输入Mathpix API凭据', 'error'); + const hasBaiduOCR = baiduApiKey && baiduSecretKey; + const hasMathpix = mathpixApiKey && mathpixApiKey !== ':'; + + // 根据OCR源配置检查可用性 + let canProceed = false; + let missingOCRMessage = ''; + + if (ocrSource === 'baidu') { + canProceed = hasBaiduOCR; + missingOCRMessage = '请在设置中配置百度OCR API密钥'; + } else if (ocrSource === 'mathpix') { + canProceed = hasMathpix; + missingOCRMessage = '请在设置中配置Mathpix API密钥'; + } else { // auto + canProceed = hasBaiduOCR || hasMathpix; + missingOCRMessage = '请在设置中配置OCR API密钥:百度OCR(推荐)或Mathpix'; + } + + if (!canProceed) { + window.uiManager.showToast(missingOCRMessage, 'error'); document.getElementById('settingsPanel').classList.add('active'); this.extractTextBtn.disabled = false; this.extractTextBtn.innerHTML = '提取文本'; @@ -1076,7 +1099,7 @@ class SnapSolver { this.socket.emit('extract_text', { image: this.croppedImage.split(',')[1], settings: { - mathpixApiKey: mathpixApiKey + ocrSource: settings.ocrSource || 'auto' } }); diff --git a/static/js/settings.js b/static/js/settings.js index 9783e04..0f87966 100644 --- a/static/js/settings.js +++ b/static/js/settings.js @@ -374,6 +374,9 @@ class SettingsManager { // 模型选择器对象 this.modelSelector = null; + // OCR源配置 + this.ocrSource = 'auto'; // 默认自动选择 + // 存储API密钥的对象 this.apiKeyValues = { 'AnthropicApiKey': '', @@ -382,6 +385,8 @@ class SettingsManager { 'AlibabaApiKey': '', 'GoogleApiKey': '', 'DoubaoApiKey': '', + 'BaiduApiKey': '', + 'BaiduSecretKey': '', 'MathpixAppId': '', 'MathpixAppKey': '' }; @@ -626,6 +631,14 @@ class SettingsManager { this.proxyPortInput.value = settings.proxyPort; } + // Load OCR source setting + if (settings.ocrSource) { + this.ocrSource = settings.ocrSource; + if (this.ocrSourceSelect) { + this.ocrSourceSelect.value = settings.ocrSource; + } + } + // Update UI based on model type this.updateUIBasedOnModelType(); @@ -786,7 +799,8 @@ class SettingsManager { currentPromptId: this.currentPromptId, proxyEnabled: this.proxyEnabledInput.checked, proxyHost: this.proxyHostInput.value, - proxyPort: this.proxyPortInput.value + proxyPort: this.proxyPortInput.value, + ocrSource: this.ocrSource // 添加OCR源配置保存 }; // 保存设置到localStorage @@ -888,6 +902,7 @@ class SettingsManager { proxyHost: this.proxyHostInput.value, proxyPort: this.proxyPortInput.value, mathpixApiKey: mathpixApiKey, + ocrSource: this.ocrSource, // 添加OCR源配置 modelInfo: { supportsMultimodal: modelInfo.supportsMultimodal || false, isReasoning: modelInfo.isReasoning || false, @@ -1128,6 +1143,20 @@ class SettingsManager { this.saveSettings(); }); + // OCR源选择器事件监听 + if (this.ocrSourceSelect) { + this.ocrSourceSelect.addEventListener('change', (e) => { + // 阻止事件冒泡 + e.stopPropagation(); + + // 更新OCR源配置 + this.ocrSource = e.target.value; + this.saveSettings(); + + console.log('OCR源已切换为:', this.ocrSource); + }); + } + // Panel visibility if (this.settingsToggle) { this.settingsToggle.addEventListener('click', () => { @@ -2219,6 +2248,9 @@ class SettingsManager { this.mathpixAppIdInput = document.getElementById('mathpixAppId'); this.mathpixAppKeyInput = document.getElementById('mathpixAppKey'); + // OCR源选择器 + this.ocrSourceSelect = document.getElementById('ocrSourceSelect'); + // API Key elements - 所有的密钥输入框 this.apiKeyInputs = { 'AnthropicApiKey': document.getElementById('AnthropicApiKey'), @@ -2268,6 +2300,8 @@ class SettingsManager { 'AlibabaApiKey': '', 'GoogleApiKey': '', 'DoubaoApiKey': '', + 'BaiduApiKey': '', + 'BaiduSecretKey': '', 'MathpixAppId': '', 'MathpixAppKey': '' }; diff --git a/static/style.css b/static/style.css index 7ba0233..a85ff46 100644 --- a/static/style.css +++ b/static/style.css @@ -2174,6 +2174,82 @@ button:disabled { transition: all 0.2s ease-in-out; } +/* OCR设置样式 */ +.ocr-settings { + margin-bottom: 1.5rem; +} + +.ocr-source-control { + display: flex; + flex-direction: column; + gap: 12px; +} + +.ocr-source-selector { + position: relative; +} + +.ocr-source-select { + width: 100%; + padding: 10px 14px; + border: 1px solid var(--border-color); + border-radius: 8px; + background: var(--surface); + color: var(--text-primary); + font-size: 0.9rem; + transition: all 0.2s ease; +} + +.ocr-source-select:hover { + border-color: var(--primary-color); + box-shadow: 0 2px 8px rgba(0, 0, 0, 0.1); +} + +.ocr-source-select:focus { + outline: none; + border-color: var(--primary-color); + box-shadow: 0 0 0 3px rgba(var(--primary-rgb), 0.1); +} + +.ocr-source-description { + display: flex; + flex-direction: column; + gap: 8px; + padding: 12px; + background: rgba(0, 0, 0, 0.02); + border-radius: 8px; + border: 1px solid var(--border-color); +} + +.ocr-desc-item { + display: flex; + align-items: flex-start; + gap: 8px; + font-size: 0.85rem; + line-height: 1.4; + color: var(--text-secondary); +} + +.ocr-desc-item i { + color: var(--primary-color); + margin-top: 2px; + flex-shrink: 0; +} + +.ocr-desc-item strong { + color: var(--text-primary); +} + +/* 暗色主题下的OCR设置样式 */ +[data-theme="dark"] .ocr-source-description { + background: rgba(255, 255, 255, 0.02); +} + +[data-theme="dark"] .ocr-source-select { + background: var(--surface); + border-color: var(--border-color); +} + /* 新增的推理控制组件样式 */ .reasoning-control { display: flex; diff --git a/templates/index.html b/templates/index.html index d2eee26..bf1252a 100644 --- a/templates/index.html +++ b/templates/index.html @@ -309,6 +309,37 @@ + +
+

OCR设置

+
+
+ +
+ +
+
+
+ + 自动选择:优先使用百度OCR,如无配置则使用Mathpix +
+
+ + 百度OCR:支持中文,免费额度大,推荐使用 +
+
+ + Mathpix:专业数学公式识别,支持LaTeX格式 +
+
+
+
+
+

API密钥设置

@@ -447,6 +478,53 @@
+ + +
+ 百度OCR API Key: +
+ +
+ 未设置 + +
+ + +
+
+
+ 百度OCR Secret Key: +
+ +
+ 未设置 + +
+ + +
+
+
Mathpix App ID: