添加百度OCR支持,更新OCR源选择和设置界面

This commit is contained in:
skestar
2025-08-03 00:19:09 +08:00
parent aef6e2abef
commit 6afe56c816
7 changed files with 480 additions and 31 deletions

65
app.py
View File

@@ -324,39 +324,66 @@ def handle_text_extraction(data):
if not isinstance(settings, dict):
raise ValueError("Invalid settings format")
# 尝试从本地配置获取Mathpix API密钥
mathpix_app_id = get_api_key('MathpixAppId')
mathpix_app_key = get_api_key('MathpixAppKey')
# 优先使用百度OCR如果没有配置则使用Mathpix
# 首先尝试获取百度OCR API密钥
baidu_api_key = get_api_key('BaiduApiKey')
baidu_secret_key = get_api_key('BaiduSecretKey')
# 构建完整的Mathpix API密钥格式app_id:app_key
mathpix_key = f"{mathpix_app_id}:{mathpix_app_key}" if mathpix_app_id and mathpix_app_key else None
# 构建百度OCR API密钥格式api_key:secret_key
ocr_key = None
ocr_model = None
# 如果本地没有配置,尝试使用前端传递的密钥(向后兼容)
if not mathpix_key:
mathpix_key = settings.get('mathpixApiKey')
if baidu_api_key and baidu_secret_key:
ocr_key = f"{baidu_api_key}:{baidu_secret_key}"
ocr_model = 'baidu-ocr'
print("Using Baidu OCR for text extraction...")
else:
# 回退到Mathpix
mathpix_app_id = get_api_key('MathpixAppId')
mathpix_app_key = get_api_key('MathpixAppKey')
# 构建完整的Mathpix API密钥格式app_id:app_key
mathpix_key = f"{mathpix_app_id}:{mathpix_app_key}" if mathpix_app_id and mathpix_app_key else None
# 如果本地没有配置,尝试使用前端传递的密钥(向后兼容)
if not mathpix_key:
mathpix_key = settings.get('mathpixApiKey')
if mathpix_key:
ocr_key = mathpix_key
ocr_model = 'mathpix'
print("Using Mathpix OCR for text extraction...")
if not mathpix_key:
raise ValueError("Mathpix API key is required")
if not ocr_key:
raise ValueError("OCR API key is required. Please configure Baidu OCR (API Key + Secret Key) or Mathpix (App ID + App Key)")
# 先回复客户端,确认已收到请求,防止超时断开
# 注意这里不能使用return否则后续代码不会执行
socketio.emit('request_acknowledged', {
'status': 'received',
'message': 'Image received, text extraction in progress'
'message': f'Image received, text extraction in progress using {ocr_model}'
}, room=request.sid)
try:
app_id, app_key = mathpix_key.split(':')
if not app_id.strip() or not app_key.strip():
raise ValueError()
if ocr_model == 'baidu-ocr':
api_key, secret_key = ocr_key.split(':')
if not api_key.strip() or not secret_key.strip():
raise ValueError()
elif ocr_model == 'mathpix':
app_id, app_key = ocr_key.split(':')
if not app_id.strip() or not app_key.strip():
raise ValueError()
except ValueError:
raise ValueError("Invalid Mathpix API key format. Expected format: 'app_id:app_key'")
if ocr_model == 'baidu-ocr':
raise ValueError("Invalid Baidu OCR API key format. Expected format: 'API_KEY:SECRET_KEY'")
else:
raise ValueError("Invalid Mathpix API key format. Expected format: 'app_id:app_key'")
print("Creating Mathpix model instance...")
# 只传递必需的参数,ModelFactory.create_model会处理不同模型类型
print(f"Creating {ocr_model} model instance...")
# ModelFactory.create_model会处理不同模型类型
model = ModelFactory.create_model(
model_name='mathpix',
api_key=mathpix_key
model_name=ocr_model,
api_key=ocr_key
)
print("Starting text extraction...")

177
models/baidu_ocr.py Normal file
View File

@@ -0,0 +1,177 @@
import base64
import json
import time
import urllib.request
import urllib.parse
from typing import Generator, Dict, Any
from .base import BaseModel
class BaiduOCRModel(BaseModel):
    """OCR backend that recognises text in images through Baidu's OCR REST API.

    An API Key / Secret Key pair is exchanged for an OAuth access token,
    which is cached on the instance; Base64 image data is then posted to the
    ``accurate_basic`` endpoint and the recognised lines are joined with
    newlines.
    """

    # Seconds to wait on a single HTTP call. Without a timeout a stalled
    # connection would block the calling handler indefinitely.
    REQUEST_TIMEOUT = 30

    # Refresh the cached token this many seconds before it actually expires.
    TOKEN_REFRESH_MARGIN = 300

    def __init__(self, api_key: str, secret_key: str = None, temperature: float = 0.7, system_prompt: str = None):
        """Initialise the Baidu OCR model.

        Args:
            api_key: Baidu API Key, or ``'API_KEY:SECRET_KEY'`` when
                ``secret_key`` is not given.
            secret_key: Baidu Secret Key. Optional when it is embedded in
                ``api_key`` after a colon.
            temperature: Unused by OCR; accepted for BaseModel compatibility.
            system_prompt: Unused by OCR; accepted for BaseModel compatibility.

        Raises:
            ValueError: If the credentials are missing, empty, or not in the
                expected format.
        """
        super().__init__(api_key, temperature, system_prompt)

        format_error = "百度OCR API密钥必须是 'API_KEY:SECRET_KEY' 格式或单独传递secret_key参数"

        # Two accepted forms: the secret passed separately, or both halves
        # packed into api_key separated by a single colon.
        if secret_key:
            key_id, key_secret = api_key, secret_key
        else:
            try:
                key_id, key_secret = api_key.split(':')
            except (AttributeError, ValueError):
                # AttributeError covers api_key=None; ValueError covers a
                # wrong number of colon-separated parts.
                raise ValueError(format_error) from None

        # Trim stray whitespace from pasted credentials and reject empty
        # halves up front instead of failing later on the token request.
        self.api_key = (key_id or '').strip()
        self.secret_key = (key_secret or '').strip()
        if not self.api_key or not self.secret_key:
            raise ValueError(format_error)

        # Baidu API URLs
        self.token_url = "https://aip.baidubce.com/oauth/2.0/token"
        self.ocr_url = "https://aip.baidubce.com/rest/2.0/ocr/v1/accurate_basic"

        # Cached access token and its absolute expiry time (epoch seconds)
        self._access_token = None
        self._token_expires = 0

    def _post_form(self, url: str, params: Dict[str, Any]) -> Dict[str, Any]:
        """POST ``params`` as a URL-encoded form and return the decoded JSON object.

        Raises:
            ValueError: If the response body is not a JSON object.
            Exception: Whatever ``urllib`` raises for network/HTTP failures.
        """
        body = urllib.parse.urlencode(params).encode('utf-8')
        request = urllib.request.Request(url, data=body)
        request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        with urllib.request.urlopen(request, timeout=self.REQUEST_TIMEOUT) as response:
            result = json.loads(response.read().decode('utf-8'))
        if not isinstance(result, dict):
            raise ValueError("响应格式无效")
        return result

    def get_access_token(self) -> str:
        """Return a valid Baidu access token, requesting a new one when needed.

        The cached token is reused until ``TOKEN_REFRESH_MARGIN`` seconds
        before its expiry.

        Raises:
            Exception: If the token request fails or the response carries no
                access token.
        """
        if self._access_token and time.time() < self._token_expires - self.TOKEN_REFRESH_MARGIN:
            return self._access_token

        params = {
            'grant_type': 'client_credentials',
            'client_id': self.api_key,
            'client_secret': self.secret_key
        }

        # Only the transport/parsing step is wrapped, so an API-level
        # rejection below is reported once rather than double-prefixed.
        try:
            result = self._post_form(self.token_url, params)
        except Exception as e:
            raise Exception(f"请求access_token失败: {str(e)}") from e

        token = result.get('access_token')
        if not token:
            raise Exception(f"获取access_token失败: {result.get('error_description', '未知错误')}")

        self._access_token = token
        # Fall back to 30 days (2592000 s) when the response omits expires_in.
        self._token_expires = time.time() + result.get('expires_in', 2592000)
        return token

    def ocr_image(self, image_data: str) -> str:
        """Run OCR on an image.

        Args:
            image_data: Base64-encoded image data. A leading
                ``data:...;base64,`` header is stripped if present.

        Returns:
            str: Recognised text, one recognised line per output line.

        Raises:
            ValueError: If ``image_data`` is empty.
            Exception: If the token cannot be obtained or the OCR request
                fails.
        """
        if not image_data:
            raise ValueError("OCR识别失败: 图像数据为空")

        # Baidu expects bare Base64 without the data-URL header. Base64 never
        # contains ':' so this check cannot hit real payload data.
        if image_data.startswith('data:') and ',' in image_data:
            image_data = image_data.split(',', 1)[1]

        access_token = self.get_access_token()

        params = {
            'image': image_data,
            'language_type': 'auto_detect',  # auto-detect the language
            'detect_direction': 'true',      # detect image orientation
            'probability': 'false'           # omit confidences (smaller response)
        }
        # Encode the token rather than splicing it into the URL verbatim.
        url = f"{self.ocr_url}?{urllib.parse.urlencode({'access_token': access_token})}"

        try:
            result = self._post_form(url, params)
            if 'error_code' in result:
                raise Exception(f"百度OCR API错误: {result.get('error_msg', '未知错误')}")

            # Skip malformed entries instead of failing the whole result.
            words_result = result.get('words_result') or []
            text_lines = [
                item['words']
                for item in words_result
                if isinstance(item, dict) and 'words' in item
            ]
            return '\n'.join(text_lines)
        except Exception as e:
            raise Exception(f"OCR识别失败: {str(e)}") from e

    def extract_full_text(self, image_data: str) -> str:
        """Extract the full text of an image (Mathpix-compatible interface).

        Args:
            image_data: Base64-encoded image data.

        Returns:
            str: The extracted text.
        """
        return self.ocr_image(image_data)

    def analyze_image(self, image_data: str, proxies: dict = None) -> Generator[Dict[str, Any], None, None]:
        """Run OCR and yield a single result dict, matching the streaming interface.

        Args:
            image_data: Base64-encoded image data.
            proxies: Proxy configuration (unused).

        Yields:
            dict: ``status`` is ``'completed'`` with the text in ``content``,
            or ``'error'`` with the failure message in ``content``.
        """
        try:
            text = self.ocr_image(image_data)
        except Exception as e:
            message = str(e)
            # ocr_image already prefixes most failures; avoid showing
            # "OCR识别失败: OCR识别失败: ..." to the user.
            if not message.startswith('OCR识别失败'):
                message = f'OCR识别失败: {message}'
            yield {
                'status': 'error',
                'content': message,
                'model': 'baidu-ocr'
            }
            return

        yield {
            'status': 'completed',
            'content': text,
            'model': 'baidu-ocr'
        }

    def analyze_text(self, text: str, proxies: dict = None) -> Generator[Dict[str, Any], None, None]:
        """Reject text analysis, which an OCR-only model does not support.

        Args:
            text: Input text (ignored).
            proxies: Proxy configuration (unused).

        Yields:
            dict: A single error response.
        """
        yield {
            'status': 'error',
            'content': 'OCR模型不支持文本分析功能',
            'model': 'baidu-ocr'
        }

    def get_model_identifier(self) -> str:
        """Return the model identifier."""
        return "baidu-ocr"

View File

@@ -3,7 +3,8 @@ import json
import os
import importlib
from .base import BaseModel
from .mathpix import MathpixModel # MathpixModel仍然需要直接导入,因为它是特殊工具
from .mathpix import MathpixModel # MathpixModel需要直接导入因为它是特殊OCR工具
from .baidu_ocr import BaiduOCRModel # 百度OCR也是特殊OCR工具直接导入
class ModelFactory:
# 模型基本信息,包含类型和特性
@@ -39,13 +40,25 @@ class ModelFactory:
'description': model_info.get('description', '')
}
# 添加Mathpix模型特殊工具模型
# 添加特殊OCR工具模型不在配置文件中定义
# 添加Mathpix OCR工具
cls._models['mathpix'] = {
'class': MathpixModel,
'is_multimodal': True,
'is_reasoning': False,
'display_name': 'Mathpix OCR',
'description': '文本提取工具,适用于数学公式和文本',
'description': '数学公式识别工具,适用于复杂数学内容',
'is_ocr_only': True
}
# 添加百度OCR工具
cls._models['baidu-ocr'] = {
'class': BaiduOCRModel,
'is_multimodal': True,
'is_reasoning': False,
'display_name': '百度OCR',
'description': '通用文字识别工具,支持中文识别',
'is_ocr_only': True
}
@@ -62,22 +75,36 @@ class ModelFactory:
# 不再硬编码模型定义,而是使用空字典
cls._models = {}
# 只保留Mathpix作为基础工具
# 添加特殊OCR工具当配置加载失败时的备用
try:
# 导入MathpixModel类
# 导入并添加Mathpix OCR工具
from .mathpix import MathpixModel
# 添加Mathpix作为基础工具
cls._models['mathpix'] = {
'class': MathpixModel,
'is_multimodal': True,
'is_reasoning': False,
'display_name': 'Mathpix OCR',
'description': '文本提取工具,适用于数学公式和文本',
'description': '数学公式识别工具,适用于复杂数学内容',
'is_ocr_only': True
}
except Exception as e:
print(f"无法加载基础Mathpix工具: {str(e)}")
print(f"无法加载Mathpix OCR工具: {str(e)}")
# 添加百度OCR工具
try:
from .baidu_ocr import BaiduOCRModel
cls._models['baidu-ocr'] = {
'class': BaiduOCRModel,
'is_multimodal': True,
'is_reasoning': False,
'display_name': '百度OCR',
'description': '通用文字识别工具,支持中文识别',
'is_ocr_only': True
}
except Exception as e:
print(f"无法加载百度OCR工具: {str(e)}")
@classmethod
def create_model(cls, model_name: str, api_key: str, temperature: float = 0.7,
@@ -148,6 +175,13 @@ class ModelFactory:
temperature=temperature,
system_prompt=system_prompt
)
# 对于百度OCR模型传递api_key支持API_KEY:SECRET_KEY格式
elif model_name == 'baidu-ocr':
return model_class(
api_key=api_key,
temperature=temperature,
system_prompt=system_prompt
)
# 对于Anthropic模型需要传递model_identifier参数
elif 'claude' in model_name.lower() or 'anthropic' in model_name.lower():
return model_class(

View File

@@ -1053,10 +1053,33 @@ class SnapSolver {
this.extractTextBtn.innerHTML = '<i class="fas fa-spinner fa-spin"></i><span>提取中...</span>';
const settings = window.settingsManager.getSettings();
// 根据用户设置的OCR源进行选择
const ocrSource = settings.ocrSource || 'auto';
const baiduApiKey = window.settingsManager.apiKeyValues.BaiduApiKey;
const baiduSecretKey = window.settingsManager.apiKeyValues.BaiduSecretKey;
const mathpixApiKey = settings.mathpixApiKey;
if (!mathpixApiKey || mathpixApiKey === ':') {
window.uiManager.showToast('请在设置中输入Mathpix API凭据', 'error');
const hasBaiduOCR = baiduApiKey && baiduSecretKey;
const hasMathpix = mathpixApiKey && mathpixApiKey !== ':';
// 根据OCR源配置检查可用性
let canProceed = false;
let missingOCRMessage = '';
if (ocrSource === 'baidu') {
canProceed = hasBaiduOCR;
missingOCRMessage = '请在设置中配置百度OCR API密钥';
} else if (ocrSource === 'mathpix') {
canProceed = hasMathpix;
missingOCRMessage = '请在设置中配置Mathpix API密钥';
} else { // auto
canProceed = hasBaiduOCR || hasMathpix;
missingOCRMessage = '请在设置中配置OCR API密钥百度OCR推荐或Mathpix';
}
if (!canProceed) {
window.uiManager.showToast(missingOCRMessage, 'error');
document.getElementById('settingsPanel').classList.add('active');
this.extractTextBtn.disabled = false;
this.extractTextBtn.innerHTML = '<i class="fas fa-font"></i><span>提取文本</span>';
@@ -1076,7 +1099,7 @@ class SnapSolver {
this.socket.emit('extract_text', {
image: this.croppedImage.split(',')[1],
settings: {
mathpixApiKey: mathpixApiKey
ocrSource: settings.ocrSource || 'auto'
}
});

View File

@@ -374,6 +374,9 @@ class SettingsManager {
// 模型选择器对象
this.modelSelector = null;
// OCR源配置
this.ocrSource = 'auto'; // 默认自动选择
// 存储API密钥的对象
this.apiKeyValues = {
'AnthropicApiKey': '',
@@ -382,6 +385,8 @@ class SettingsManager {
'AlibabaApiKey': '',
'GoogleApiKey': '',
'DoubaoApiKey': '',
'BaiduApiKey': '',
'BaiduSecretKey': '',
'MathpixAppId': '',
'MathpixAppKey': ''
};
@@ -626,6 +631,14 @@ class SettingsManager {
this.proxyPortInput.value = settings.proxyPort;
}
// Load OCR source setting
if (settings.ocrSource) {
this.ocrSource = settings.ocrSource;
if (this.ocrSourceSelect) {
this.ocrSourceSelect.value = settings.ocrSource;
}
}
// Update UI based on model type
this.updateUIBasedOnModelType();
@@ -786,7 +799,8 @@ class SettingsManager {
currentPromptId: this.currentPromptId,
proxyEnabled: this.proxyEnabledInput.checked,
proxyHost: this.proxyHostInput.value,
proxyPort: this.proxyPortInput.value
proxyPort: this.proxyPortInput.value,
ocrSource: this.ocrSource // 添加OCR源配置保存
};
// 保存设置到localStorage
@@ -888,6 +902,7 @@ class SettingsManager {
proxyHost: this.proxyHostInput.value,
proxyPort: this.proxyPortInput.value,
mathpixApiKey: mathpixApiKey,
ocrSource: this.ocrSource, // 添加OCR源配置
modelInfo: {
supportsMultimodal: modelInfo.supportsMultimodal || false,
isReasoning: modelInfo.isReasoning || false,
@@ -1128,6 +1143,20 @@ class SettingsManager {
this.saveSettings();
});
// OCR源选择器事件监听
if (this.ocrSourceSelect) {
this.ocrSourceSelect.addEventListener('change', (e) => {
// 阻止事件冒泡
e.stopPropagation();
// 更新OCR源配置
this.ocrSource = e.target.value;
this.saveSettings();
console.log('OCR源已切换为:', this.ocrSource);
});
}
// Panel visibility
if (this.settingsToggle) {
this.settingsToggle.addEventListener('click', () => {
@@ -2219,6 +2248,9 @@ class SettingsManager {
this.mathpixAppIdInput = document.getElementById('mathpixAppId');
this.mathpixAppKeyInput = document.getElementById('mathpixAppKey');
// OCR源选择器
this.ocrSourceSelect = document.getElementById('ocrSourceSelect');
// API Key elements - 所有的密钥输入框
this.apiKeyInputs = {
'AnthropicApiKey': document.getElementById('AnthropicApiKey'),
@@ -2268,6 +2300,8 @@ class SettingsManager {
'AlibabaApiKey': '',
'GoogleApiKey': '',
'DoubaoApiKey': '',
'BaiduApiKey': '',
'BaiduSecretKey': '',
'MathpixAppId': '',
'MathpixAppKey': ''
};

View File

@@ -2174,6 +2174,82 @@ button:disabled {
transition: all 0.2s ease-in-out;
}
/* OCR settings styles */
/* Outer section: only adds spacing below the OCR settings block. */
.ocr-settings {
margin-bottom: 1.5rem;
}
/* Stacks the label, the selector and the description box vertically. */
.ocr-source-control {
display: flex;
flex-direction: column;
gap: 12px;
}
/* Wrapper around the <select>. */
.ocr-source-selector {
position: relative;
}
/* The OCR source drop-down itself; colours come from theme custom properties. */
.ocr-source-select {
width: 100%;
padding: 10px 14px;
border: 1px solid var(--border-color);
border-radius: 8px;
background: var(--surface);
color: var(--text-primary);
font-size: 0.9rem;
transition: all 0.2s ease;
}
/* Hover: highlight the border and add a soft shadow. */
.ocr-source-select:hover {
border-color: var(--primary-color);
box-shadow: 0 2px 8px rgba(0, 0, 0, 0.1);
}
/* Focus: replace the default outline with a coloured ring. */
/* NOTE(review): relies on --primary-rgb being defined elsewhere - confirm. */
.ocr-source-select:focus {
outline: none;
border-color: var(--primary-color);
box-shadow: 0 0 0 3px rgba(var(--primary-rgb), 0.1);
}
/* Bordered, faintly tinted box listing what each OCR source does. */
.ocr-source-description {
display: flex;
flex-direction: column;
gap: 8px;
padding: 12px;
background: rgba(0, 0, 0, 0.02);
border-radius: 8px;
border: 1px solid var(--border-color);
}
/* One description row: icon followed by text, aligned to the top. */
.ocr-desc-item {
display: flex;
align-items: flex-start;
gap: 8px;
font-size: 0.85rem;
line-height: 1.4;
color: var(--text-secondary);
}
/* Row icon: fixed width (no shrinking), nudged down 2px. */
.ocr-desc-item i {
color: var(--primary-color);
margin-top: 2px;
flex-shrink: 0;
}
/* Source name inside a row uses the primary text colour. */
.ocr-desc-item strong {
color: var(--text-primary);
}
/* OCR settings styles under the dark theme */
/* Use a light tint instead of the dark one for the description box. */
[data-theme="dark"] .ocr-source-description {
background: rgba(255, 255, 255, 0.02);
}
/* Same custom properties as the base rule, declared at higher specificity. */
[data-theme="dark"] .ocr-source-select {
background: var(--surface);
border-color: var(--border-color);
}
/* 新增的推理控制组件样式 */
.reasoning-control {
display: flex;

View File

@@ -309,6 +309,37 @@
</div>
</div>
<!-- OCR settings section -->
<div class="settings-section ocr-settings">
<h3><i class="fas fa-font"></i> OCR设置</h3>
<div class="setting-group">
<div class="ocr-source-control">
<label for="ocrSourceSelect"><i class="fas fa-eye"></i> OCR工具源</label>
<div class="ocr-source-selector">
<!-- OCR source selector; the settings script looks this element up by its id, and the option values are auto / baidu / mathpix -->
<select id="ocrSourceSelect" class="ocr-source-select">
<option value="auto">自动选择</option>
<option value="baidu">百度OCR</option>
<option value="mathpix">Mathpix</option>
</select>
</div>
<!-- Static help text: one row per selectable OCR source -->
<div class="ocr-source-description">
<div class="ocr-desc-item">
<i class="fas fa-magic"></i>
<span><strong>自动选择:</strong>优先使用百度OCR如无配置则使用Mathpix</span>
</div>
<div class="ocr-desc-item">
<i class="fas fa-language"></i>
<span><strong>百度OCR</strong>支持中文,免费额度大,推荐使用</span>
</div>
<div class="ocr-desc-item">
<i class="fas fa-square-root-alt"></i>
<span><strong>Mathpix</strong>专业数学公式识别支持LaTeX格式</span>
</div>
</div>
</div>
</div>
</div>
<!-- 2. 所有API密钥集中在一个区域 -->
<div class="settings-section api-key-settings">
<h3><i class="fas fa-key"></i> API密钥设置</h3>
@@ -447,6 +478,53 @@
</div>
</div>
</div>
<!-- Baidu OCR API Key configuration -->
<div class="api-key-status">
<span class="key-name">百度OCR API Key:</span>
<div class="key-status-wrapper">
<!-- Display state: status label plus an edit button -->
<div class="key-display">
<span id="BaiduApiKeyStatus" class="key-status" data-key="BaiduApiKey">未设置</span>
<button class="btn-icon edit-api-key" data-key-type="BaiduApiKey" title="编辑此密钥">
<i class="fas fa-edit"></i>
</button>
</div>
<!-- Edit state: carries the "hidden" class initially; masked input, visibility toggle and save button -->
<div class="key-edit hidden">
<input type="password" class="key-input" data-key-type="BaiduApiKey" placeholder="输入百度OCR API Key">
<button class="btn-icon toggle-visibility">
<i class="fas fa-eye"></i>
</button>
<button class="btn-icon save-api-key" data-key-type="BaiduApiKey" title="保存密钥">
<i class="fas fa-save"></i>
</button>
</div>
</div>
</div>
<!-- Baidu OCR Secret Key configuration -->
<div class="api-key-status">
<span class="key-name">百度OCR Secret Key:</span>
<div class="key-status-wrapper">
<!-- Display state: status label plus an edit button -->
<div class="key-display">
<span id="BaiduSecretKeyStatus" class="key-status" data-key="BaiduSecretKey">未设置</span>
<button class="btn-icon edit-api-key" data-key-type="BaiduSecretKey" title="编辑此密钥">
<i class="fas fa-edit"></i>
</button>
</div>
<!-- Edit state: carries the "hidden" class initially; masked input, visibility toggle and save button -->
<div class="key-edit hidden">
<input type="password" class="key-input" data-key-type="BaiduSecretKey" placeholder="输入百度OCR Secret Key">
<button class="btn-icon toggle-visibility">
<i class="fas fa-eye"></i>
</button>
<button class="btn-icon save-api-key" data-key-type="BaiduSecretKey" title="保存密钥">
<i class="fas fa-save"></i>
</button>
</div>
</div>
</div>
<div class="api-key-status">
<span class="key-name">Mathpix App ID:</span>
<div class="key-status-wrapper">