feat: add skills and upgrade feishu/dingtalk channel

This commit is contained in:
zhayujie
2026-02-02 00:42:39 +08:00
parent 77c2bfcc1e
commit a8d5309c90
32 changed files with 2931 additions and 200 deletions

View File

@@ -237,8 +237,8 @@ def _build_tooling_section(tools: List[Any], language: str) -> List[str]:
"叙述要求: 保持简洁、信息密度高,避免重复显而易见的步骤。",
"",
"完成标准:",
"- 确保用户的需求得到实际解决,而不仅仅是制定计划",
"- 当任务需要多次工具调用时,持续推进直到完成",
"- 确保用户的需求得到实际解决,而不仅仅是制定计划",
"- 当任务需要多次工具调用时,持续推进直到完成, 解决完后向用户报告结果或回复用户的问题",
"- 每次工具调用后,评估是否已获得足够信息来推进或完成任务",
"- 避免重复调用相同的工具和相同参数获取相同的信息,除非用户明确要求",
"",

View File

@@ -360,6 +360,9 @@ class Agent:
# Update agent's message history from executor
self.messages = executor.messages
# Store executor reference for agent_bridge to access files_to_send
self.stream_executor = executor
# Execute all post-process tools
self._execute_post_process_tools()

View File

@@ -58,6 +58,9 @@ class AgentStreamExecutor:
# Tool failure tracking for retry protection
self.tool_failure_history = [] # List of (tool_name, args_hash, success) tuples
# Track files to send (populated by read tool)
self.files_to_send = [] # List of file metadata dicts
def _emit_event(self, event_type: str, data: dict = None):
"""Emit event"""
@@ -191,21 +194,47 @@ class AgentStreamExecutor:
logger.info(
f"Memory flush recommended: tokens={current_tokens}, turns={self.agent.memory_manager.flush_manager.turn_count}")
# Call LLM
assistant_msg, tool_calls = self._call_llm_stream()
# Call LLM (enable retry_on_empty for better reliability)
assistant_msg, tool_calls = self._call_llm_stream(retry_on_empty=True)
final_response = assistant_msg
# No tool calls, end loop
if not tool_calls:
# 检查是否返回了空响应
if not assistant_msg:
logger.warning(f"[Agent] LLM returned empty response (no content and no tool calls)")
logger.warning(f"[Agent] LLM returned empty response after retry (no content and no tool calls)")
logger.info(f"[Agent] This usually happens when LLM thinks the task is complete after tool execution")
# 生成通用的友好提示
final_response = (
"抱歉,我暂时无法生成回复。请尝试换一种方式描述你的需求,或稍后再试。"
)
logger.info(f"Generated fallback response for empty LLM output")
# 如果之前有工具调用,强制要求 LLM 生成文本回复
if turn > 1:
logger.info(f"[Agent] Requesting explicit response from LLM...")
# 添加一条消息,明确要求回复用户
self.messages.append({
"role": "user",
"content": [{
"type": "text",
"text": "请向用户说明刚才工具执行的结果或回答用户的问题。"
}]
})
# 再调用一次 LLM
assistant_msg, tool_calls = self._call_llm_stream(retry_on_empty=False)
final_response = assistant_msg
# 如果还是空,才使用 fallback
if not assistant_msg and not tool_calls:
logger.warning(f"[Agent] Still empty after explicit request")
final_response = (
"抱歉,我暂时无法生成回复。请尝试换一种方式描述你的需求,或稍后再试。"
)
logger.info(f"Generated fallback response for empty LLM output")
else:
# 第一轮就空回复,直接 fallback
final_response = (
"抱歉,我暂时无法生成回复。请尝试换一种方式描述你的需求,或稍后再试。"
)
logger.info(f"Generated fallback response for empty LLM output")
else:
logger.info(f"💭 {assistant_msg[:150]}{'...' if len(assistant_msg) > 150 else ''}")
@@ -235,6 +264,14 @@ class AgentStreamExecutor:
result = self._execute_tool(tool_call)
tool_results.append(result)
# Check if this is a file to send (from read tool)
if result.get("status") == "success" and isinstance(result.get("result"), dict):
result_data = result.get("result")
if result_data.get("type") == "file_to_send":
# Store file metadata for later sending
self.files_to_send.append(result_data)
logger.info(f"📎 检测到待发送文件: {result_data.get('file_name', result_data.get('path'))}")
# Check for critical error - abort entire conversation
if result.get("status") == "critical_error":
logger.error(f"💥 检测到严重错误,终止对话")
@@ -392,6 +429,7 @@ class AgentStreamExecutor:
# Streaming response
full_content = ""
tool_calls_buffer = {} # {index: {id, name, arguments}}
stop_reason = None # Track why the stream stopped
try:
stream = self.model.call_stream(request)
@@ -404,21 +442,47 @@ class AgentStreamExecutor:
if isinstance(error_data, dict):
error_msg = error_data.get("message", chunk.get("message", "Unknown error"))
error_code = error_data.get("code", "")
error_type = error_data.get("type", "")
else:
error_msg = chunk.get("message", str(error_data))
error_code = ""
error_type = ""
status_code = chunk.get("status_code", "N/A")
logger.error(f"API Error: {error_msg} (Status: {status_code}, Code: {error_code})")
logger.error(f"Full error chunk: {chunk}")
# Raise exception with full error message for retry logic
raise Exception(f"{error_msg} (Status: {status_code})")
# Log error with all available information
logger.error(f"🔴 Stream API Error:")
logger.error(f" Message: {error_msg}")
logger.error(f" Status Code: {status_code}")
logger.error(f" Error Code: {error_code}")
logger.error(f" Error Type: {error_type}")
logger.error(f" Full chunk: {chunk}")
# Check if this is a context overflow error (keyword-based, works for all models)
# Don't rely on specific status codes as different providers use different codes
error_msg_lower = error_msg.lower()
is_overflow = any(keyword in error_msg_lower for keyword in [
'context length exceeded', 'maximum context length', 'prompt is too long',
'context overflow', 'context window', 'too large', 'exceeds model context',
'request_too_large', 'request exceeds the maximum size', 'tokens exceed'
])
if is_overflow:
# Mark as context overflow for special handling
raise Exception(f"[CONTEXT_OVERFLOW] {error_msg} (Status: {status_code})")
else:
# Raise exception with full error message for retry logic
raise Exception(f"{error_msg} (Status: {status_code}, Code: {error_code}, Type: {error_type})")
# Parse chunk
if isinstance(chunk, dict) and "choices" in chunk:
choice = chunk["choices"][0]
delta = choice.get("delta", {})
# Capture finish_reason if present
finish_reason = choice.get("finish_reason")
if finish_reason:
stop_reason = finish_reason
# Handle text content
if "content" in delta and delta["content"]:
@@ -449,9 +513,46 @@ class AgentStreamExecutor:
tool_calls_buffer[index]["arguments"] += func["arguments"]
except Exception as e:
error_str = str(e).lower()
error_str = str(e)
error_str_lower = error_str.lower()
# Check if error is context overflow (non-retryable, needs session reset)
# Method 1: Check for special marker (set in stream error handling above)
is_context_overflow = '[context_overflow]' in error_str_lower
# Method 2: Fallback to keyword matching for non-stream errors
if not is_context_overflow:
is_context_overflow = any(keyword in error_str_lower for keyword in [
'context length exceeded', 'maximum context length', 'prompt is too long',
'context overflow', 'context window', 'too large', 'exceeds model context',
'request_too_large', 'request exceeds the maximum size'
])
# Check if error is message format error (incomplete tool_use/tool_result pairs)
# This happens when previous conversation had tool failures
is_message_format_error = any(keyword in error_str_lower for keyword in [
'tool_use', 'tool_result', 'without', 'immediately after',
'corresponding', 'must have', 'each'
]) and 'status: 400' in error_str_lower
if is_context_overflow or is_message_format_error:
error_type = "context overflow" if is_context_overflow else "message format error"
logger.error(f"💥 {error_type} detected: {e}")
# Clear message history to recover
logger.warning("🔄 Clearing conversation history to recover")
self.messages.clear()
# Raise special exception with user-friendly message
if is_context_overflow:
raise Exception(
"抱歉,对话历史过长导致上下文溢出。我已清空历史记录,请重新描述你的需求。"
)
else:
raise Exception(
"抱歉,之前的对话出现了问题。我已清空历史记录,请重新发送你的消息。"
)
# Check if error is retryable (timeout, connection, rate limit, server busy, etc.)
is_retryable = any(keyword in error_str for keyword in [
is_retryable = any(keyword in error_str_lower for keyword in [
'timeout', 'timed out', 'connection', 'network',
'rate limit', 'overloaded', 'unavailable', 'busy', 'retry',
'429', '500', '502', '503', '504', '512'
@@ -505,11 +606,12 @@ class AgentStreamExecutor:
# Check for empty response and retry once if enabled
if retry_on_empty and not full_content and not tool_calls:
logger.warning(f"⚠️ LLM returned empty response, retrying once...")
logger.warning(f"⚠️ LLM returned empty response (stop_reason: {stop_reason}), retrying once...")
self._emit_event("message_end", {
"content": "",
"tool_calls": [],
"empty_retry": True
"empty_retry": True,
"stop_reason": stop_reason
})
# Retry without retry flag to avoid infinite loop
return self._call_llm_stream(

View File

@@ -137,6 +137,10 @@ class SkillLoader:
name = frontmatter.get('name', parent_dir_name)
description = frontmatter.get('description', '')
# Special handling for linkai-agent: dynamically load apps from config.json
if name == 'linkai-agent':
description = self._load_linkai_agent_description(skill_dir, description)
if not description or not description.strip():
diagnostics.append(f"Skill {name} has no description: {file_path}")
return LoadSkillsResult(skills=[], diagnostics=diagnostics)
@@ -161,6 +165,45 @@ class SkillLoader:
return LoadSkillsResult(skills=[skill], diagnostics=diagnostics)
def _load_linkai_agent_description(self, skill_dir: str, default_description: str) -> str:
"""
Dynamically load LinkAI agent description from config.json
:param skill_dir: Skill directory
:param default_description: Default description from SKILL.md
:return: Dynamic description with app list
"""
import json
config_path = os.path.join(skill_dir, "config.json")
template_path = os.path.join(skill_dir, "config.json.template")
# Try to load config.json or fallback to template
config_file = config_path if os.path.exists(config_path) else template_path
if not os.path.exists(config_file):
return default_description
try:
with open(config_file, 'r', encoding='utf-8') as f:
config = json.load(f)
apps = config.get("apps", [])
if not apps:
return default_description
# Build dynamic description with app details
app_descriptions = "; ".join([
f"{app['app_name']}({app['app_code']}: {app['app_description']})"
for app in apps
])
return f"Call LinkAI apps/workflows. {app_descriptions}"
except Exception as e:
logger.warning(f"[SkillLoader] Failed to load linkai-agent config: {e}")
return default_description
def load_all_skills(
self,
managed_dir: Optional[str] = None,

View File

@@ -8,6 +8,7 @@ from agent.tools.write.write import Write
from agent.tools.edit.edit import Edit
from agent.tools.bash.bash import Bash
from agent.tools.ls.ls import Ls
from agent.tools.send.send import Send
# Import memory tools
from agent.tools.memory.memory_search import MemorySearchTool
@@ -112,6 +113,7 @@ __all__ = [
'Edit',
'Bash',
'Ls',
'Send',
'MemorySearchTool',
'MemoryGetTool',
'EnvConfig',

View File

@@ -3,12 +3,14 @@ Bash tool - Execute bash commands
"""
import os
import sys
import subprocess
import tempfile
from typing import Dict, Any
from agent.tools.base_tool import BaseTool, ToolResult
from agent.tools.utils.truncate import truncate_tail, format_size, DEFAULT_MAX_LINES, DEFAULT_MAX_BYTES
from common.log import logger
class Bash(BaseTool):
@@ -60,6 +62,12 @@ IMPORTANT SAFETY GUIDELINES:
if not command:
return ToolResult.fail("Error: command parameter is required")
# Security check: Prevent accessing sensitive config files
if "~/.cow/.env" in command or "~/.cow" in command:
return ToolResult.fail(
"Error: Access denied. API keys and credentials must be accessed through the env_config tool only."
)
# Optional safety check - only warn about extremely dangerous commands
if self.safety_mode:
warning = self._get_safety_warning(command)
@@ -68,7 +76,31 @@ IMPORTANT SAFETY GUIDELINES:
f"Safety Warning: {warning}\n\nIf you believe this command is safe and necessary, please ask the user for confirmation first, explaining what the command does and why it's needed.")
try:
# Execute command
# Prepare environment with .env file variables
env = os.environ.copy()
# Load environment variables from ~/.cow/.env if it exists
env_file = os.path.expanduser("~/.cow/.env")
if os.path.exists(env_file):
try:
from dotenv import dotenv_values
env_vars = dotenv_values(env_file)
env.update(env_vars)
logger.debug(f"[Bash] Loaded {len(env_vars)} variables from {env_file}")
except ImportError:
logger.debug("[Bash] python-dotenv not installed, skipping .env loading")
except Exception as e:
logger.debug(f"[Bash] Failed to load .env: {e}")
# Debug logging
logger.debug(f"[Bash] CWD: {self.cwd}")
logger.debug(f"[Bash] Command: {command[:500]}")
logger.debug(f"[Bash] OPENAI_API_KEY in env: {'OPENAI_API_KEY' in env}")
logger.debug(f"[Bash] SHELL: {env.get('SHELL', 'not set')}")
logger.debug(f"[Bash] Python executable: {sys.executable}")
logger.debug(f"[Bash] Process UID: {os.getuid()}")
# Execute command with inherited environment variables
result = subprocess.run(
command,
shell=True,
@@ -76,8 +108,50 @@ IMPORTANT SAFETY GUIDELINES:
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
timeout=timeout
timeout=timeout,
env=env
)
logger.debug(f"[Bash] Exit code: {result.returncode}")
logger.debug(f"[Bash] Stdout length: {len(result.stdout)}")
logger.debug(f"[Bash] Stderr length: {len(result.stderr)}")
# Workaround for exit code 126 with no output
if result.returncode == 126 and not result.stdout and not result.stderr:
logger.warning(f"[Bash] Exit 126 with no output - trying alternative execution method")
# Try using argument list instead of shell=True
import shlex
try:
parts = shlex.split(command)
if len(parts) > 0:
logger.info(f"[Bash] Retrying with argument list: {parts[:3]}...")
retry_result = subprocess.run(
parts,
cwd=self.cwd,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
timeout=timeout,
env=env
)
logger.debug(f"[Bash] Retry exit code: {retry_result.returncode}, stdout: {len(retry_result.stdout)}, stderr: {len(retry_result.stderr)}")
# If retry succeeded, use retry result
if retry_result.returncode == 0 or retry_result.stdout or retry_result.stderr:
result = retry_result
else:
# Both attempts failed - check if this is openai-image-vision skill
if 'openai-image-vision' in command or 'vision.sh' in command:
# Create a mock result with helpful error message
from types import SimpleNamespace
result = SimpleNamespace(
returncode=1,
stdout='{"error": "图片无法解析", "reason": "该图片格式可能不受支持,或图片文件存在问题", "suggestion": "请尝试其他图片"}',
stderr=''
)
logger.info(f"[Bash] Converted exit 126 to user-friendly image error message for vision skill")
except Exception as retry_err:
logger.warning(f"[Bash] Retry failed: {retry_err}")
# Combine stdout and stderr
output = result.stdout

View File

@@ -27,7 +27,7 @@ class EnvConfig(BaseTool):
name: str = "env_config"
description: str = (
"Manage API keys and skill configurations stored in the workspace .env file. "
"Manage API keys and skill configurations securely. "
"Use this tool when user wants to configure API keys (like BOCHA_API_KEY, OPENAI_API_KEY), "
"view configured keys, or manage skill settings. "
"Actions: 'set' (add/update key), 'get' (view specific key), 'list' (show all configured keys), 'delete' (remove key). "
@@ -65,16 +65,17 @@ class EnvConfig(BaseTool):
def __init__(self, config: dict = None):
self.config = config or {}
self.workspace_dir = self.config.get("workspace_dir", os.path.expanduser("~/cow"))
self.env_path = os.path.join(self.workspace_dir, '.env')
# Store env config in ~/.cow directory (outside workspace for security)
self.env_dir = os.path.expanduser("~/.cow")
self.env_path = os.path.join(self.env_dir, '.env')
self.agent_bridge = self.config.get("agent_bridge") # Reference to AgentBridge for hot reload
# Don't create .env file in __init__ to avoid issues during tool discovery
# It will be created on first use in execute()
def _ensure_env_file(self):
"""Ensure the .env file exists"""
# Create workspace directory if it doesn't exist
os.makedirs(self.workspace_dir, exist_ok=True)
# Create ~/.cow directory if it doesn't exist
os.makedirs(self.env_dir, exist_ok=True)
if not os.path.exists(self.env_path):
Path(self.env_path).touch()

View File

@@ -50,6 +50,13 @@ class Ls(BaseTool):
# Resolve path
absolute_path = self._resolve_path(path)
# Security check: Prevent accessing sensitive config directory
env_config_dir = os.path.expanduser("~/.cow")
if os.path.abspath(absolute_path) == os.path.abspath(env_config_dir):
return ToolResult.fail(
"Error: Access denied. API keys and credentials must be accessed through the env_config tool only."
)
if not os.path.exists(absolute_path):
# Provide helpful hint if using relative path
if not os.path.isabs(path) and not path.startswith('~'):

View File

@@ -15,7 +15,7 @@ class Read(BaseTool):
"""Tool for reading file contents"""
name: str = "read"
description: str = f"Read the contents of a file. Supports text files, PDF files, and images (jpg, png, gif, webp). For text files, output is truncated to {DEFAULT_MAX_LINES} lines or {DEFAULT_MAX_BYTES // 1024}KB (whichever is hit first). Use offset/limit for large files."
description: str = f"Read or inspect file contents. For text/PDF files, returns content (truncated to {DEFAULT_MAX_LINES} lines or {DEFAULT_MAX_BYTES // 1024}KB). For images/videos/audio, returns metadata only (file info, size, type). Use offset/limit for large text files."
params: dict = {
"type": "object",
@@ -39,10 +39,25 @@ class Read(BaseTool):
def __init__(self, config: dict = None):
self.config = config or {}
self.cwd = self.config.get("cwd", os.getcwd())
# Supported image formats
self.image_extensions = {'.jpg', '.jpeg', '.png', '.gif', '.webp'}
# Supported PDF format
# File type categories
self.image_extensions = {'.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp', '.svg', '.ico'}
self.video_extensions = {'.mp4', '.avi', '.mov', '.mkv', '.flv', '.wmv', '.webm', '.m4v'}
self.audio_extensions = {'.mp3', '.wav', '.ogg', '.m4a', '.flac', '.aac', '.wma'}
self.binary_extensions = {'.exe', '.dll', '.so', '.dylib', '.bin', '.dat', '.db', '.sqlite'}
self.archive_extensions = {'.zip', '.tar', '.gz', '.rar', '.7z', '.bz2', '.xz'}
self.pdf_extensions = {'.pdf'}
# Readable text formats (will be read with truncation)
self.text_extensions = {
'.txt', '.md', '.markdown', '.rst', '.log', '.csv', '.tsv', '.json', '.xml', '.yaml', '.yml',
'.py', '.js', '.ts', '.java', '.c', '.cpp', '.h', '.hpp', '.go', '.rs', '.rb', '.php',
'.html', '.css', '.scss', '.sass', '.less', '.vue', '.jsx', '.tsx',
'.sh', '.bash', '.zsh', '.fish', '.ps1', '.bat', '.cmd',
'.sql', '.r', '.m', '.swift', '.kt', '.scala', '.clj', '.erl', '.ex',
'.dockerfile', '.makefile', '.cmake', '.gradle', '.properties', '.ini', '.conf', '.cfg',
'.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx' # Office documents
}
def execute(self, args: Dict[str, Any]) -> ToolResult:
"""
@@ -61,6 +76,13 @@ class Read(BaseTool):
# Resolve path
absolute_path = self._resolve_path(path)
# Security check: Prevent reading sensitive config files
env_config_path = os.path.expanduser("~/.cow/.env")
if os.path.abspath(absolute_path) == os.path.abspath(env_config_path):
return ToolResult.fail(
"Error: Access denied. API keys and credentials must be accessed through the env_config tool only."
)
# Check if file exists
if not os.path.exists(absolute_path):
# Provide helpful hint if using relative path
@@ -78,16 +100,25 @@ class Read(BaseTool):
# Check file type
file_ext = Path(absolute_path).suffix.lower()
file_size = os.path.getsize(absolute_path)
# Check if image
# Check if image - return metadata for sending
if file_ext in self.image_extensions:
return self._read_image(absolute_path, file_ext)
# Check if video/audio/binary/archive - return metadata only
if file_ext in self.video_extensions:
return self._return_file_metadata(absolute_path, "video", file_size)
if file_ext in self.audio_extensions:
return self._return_file_metadata(absolute_path, "audio", file_size)
if file_ext in self.binary_extensions or file_ext in self.archive_extensions:
return self._return_file_metadata(absolute_path, "binary", file_size)
# Check if PDF
if file_ext in self.pdf_extensions:
return self._read_pdf(absolute_path, path, offset, limit)
# Read text file
# Read text file (with truncation for large files)
return self._read_text(absolute_path, path, offset, limit)
def _resolve_path(self, path: str) -> str:
@@ -103,25 +134,56 @@ class Read(BaseTool):
return path
return os.path.abspath(os.path.join(self.cwd, path))
def _return_file_metadata(self, absolute_path: str, file_type: str, file_size: int) -> ToolResult:
"""
Return file metadata for non-readable files (video, audio, binary, etc.)
:param absolute_path: Absolute path to the file
:param file_type: Type of file (video, audio, binary, etc.)
:param file_size: File size in bytes
:return: File metadata
"""
file_name = Path(absolute_path).name
file_ext = Path(absolute_path).suffix.lower()
# Determine MIME type
mime_types = {
# Video
'.mp4': 'video/mp4', '.avi': 'video/x-msvideo', '.mov': 'video/quicktime',
'.mkv': 'video/x-matroska', '.webm': 'video/webm',
# Audio
'.mp3': 'audio/mpeg', '.wav': 'audio/wav', '.ogg': 'audio/ogg',
'.m4a': 'audio/mp4', '.flac': 'audio/flac',
# Binary
'.zip': 'application/zip', '.tar': 'application/x-tar',
'.gz': 'application/gzip', '.rar': 'application/x-rar-compressed',
}
mime_type = mime_types.get(file_ext, 'application/octet-stream')
result = {
"type": f"{file_type}_metadata",
"file_type": file_type,
"path": absolute_path,
"file_name": file_name,
"mime_type": mime_type,
"size": file_size,
"size_formatted": format_size(file_size),
"message": f"{file_type.capitalize()} 文件: {file_name} ({format_size(file_size)})\n提示: 如果需要发送此文件,请使用 send 工具。"
}
return ToolResult.success(result)
def _read_image(self, absolute_path: str, file_ext: str) -> ToolResult:
"""
Read image file
Read image file - always return metadata only (images should be sent, not read into context)
:param absolute_path: Absolute path to the image file
:param file_ext: File extension
:return: Result containing image information
:return: Result containing image metadata for sending
"""
try:
# Read image file
with open(absolute_path, 'rb') as f:
image_data = f.read()
# Get file size
file_size = len(image_data)
# Return image information (actual image data can be base64 encoded when needed)
import base64
base64_data = base64.b64encode(image_data).decode('utf-8')
file_size = os.path.getsize(absolute_path)
# Determine MIME type
mime_type_map = {
@@ -133,12 +195,15 @@ class Read(BaseTool):
}
mime_type = mime_type_map.get(file_ext, 'image/jpeg')
# Return metadata for images (NOT file_to_send - use send tool to actually send)
result = {
"type": "image",
"type": "image_metadata",
"file_type": "image",
"path": absolute_path,
"mime_type": mime_type,
"size": file_size,
"size_formatted": format_size(file_size),
"data": base64_data # Base64 encoded image data
"message": f"图片文件: {Path(absolute_path).name} ({format_size(file_size)})\n提示: 如果需要发送此图片,请使用 send 工具。"
}
return ToolResult.success(result)
@@ -157,10 +222,32 @@ class Read(BaseTool):
:return: File content or error message
"""
try:
# Check file size first
file_size = os.path.getsize(absolute_path)
MAX_FILE_SIZE = 50 * 1024 * 1024 # 50MB
if file_size > MAX_FILE_SIZE:
# File too large, return metadata only
return ToolResult.success({
"type": "file_to_send",
"file_type": "document",
"path": absolute_path,
"size": file_size,
"size_formatted": format_size(file_size),
"message": f"文件过大 ({format_size(file_size)} > 50MB),无法读取内容。文件路径: {absolute_path}"
})
# Read file
with open(absolute_path, 'r', encoding='utf-8') as f:
content = f.read()
# Truncate content if too long (20K characters max for model context)
MAX_CONTENT_CHARS = 20 * 1024 # 20K characters
content_truncated = False
if len(content) > MAX_CONTENT_CHARS:
content = content[:MAX_CONTENT_CHARS]
content_truncated = True
all_lines = content.split('\n')
total_file_lines = len(all_lines)
@@ -197,6 +284,10 @@ class Read(BaseTool):
output_text = ""
details = {}
# Add truncation warning if content was truncated
if content_truncated:
output_text = f"[文件内容已截断到前 {format_size(MAX_CONTENT_CHARS)},完整文件大小: {format_size(file_size)}]\n\n"
if truncation.first_line_exceeds_limit:
# First line exceeds 30KB limit
first_line_size = format_size(len(all_lines[start_line].encode('utf-8')))

View File

@@ -42,24 +42,26 @@ Agent: [调用 scheduler 工具]
**示例对话:**
```
用户: 每天早上8点帮我搜索一下当前新闻
用户: 每天早上8点帮我读取一下今日日程
Agent: [调用 scheduler 工具]
action: create
name: 每日新闻
name: 每日日程
tool_call:
tool_name: bocha_search
tool_name: read
tool_params:
query: 今日新闻
result_prefix: 📰 今日新闻播报
file_path: ~/cow/schedule.txt
result_prefix: 📅 今日日程
schedule_type: cron
schedule_value: 0 8 * * *
```
**工具调用参数说明:**
- `tool_name`: 要调用的工具名称(如 `bocha_search``web_fetch`
- `tool_name`: 要调用的工具名称(如 `bash``read``write` 等内置工具
- `tool_params`: 工具的参数(字典格式)
- `result_prefix`: 可选,在结果前添加的前缀文本
**注意:** 如果要使用 skills如 bocha-search需要通过 `bash` 工具调用 skill 脚本
### 2. 支持的调度类型
#### Cron 表达式 (`cron`)
@@ -167,7 +169,7 @@ Agent: [调用 scheduler 工具]
```json
{
"id": "def456",
"name": "每日新闻",
"name": "每日日程",
"enabled": true,
"created_at": "2024-01-01T10:00:00",
"updated_at": "2024-01-01T10:00:00",
@@ -177,11 +179,11 @@ Agent: [调用 scheduler 工具]
},
"action": {
"type": "tool_call",
"tool_name": "bocha_search",
"tool_name": "read",
"tool_params": {
"query": "今日新闻"
"file_path": "~/cow/schedule.txt"
},
"result_prefix": "📰 今日新闻播报",
"result_prefix": "📅 今日日程",
"receiver": "wxid_xxx",
"receiver_name": "张三",
"is_group": false,
@@ -234,30 +236,29 @@ Agent: [创建 cron: 0 18 * * 1-5]
Agent: [创建 interval: 3600]
```
### 4. 每日新闻推送(动态工具调用)
### 4. 每日日程推送(动态工具调用)
```
用户: 每天早上8点帮我搜索一下当前新闻
用户: 每天早上8点帮我读取今日日程
Agent: ✅ 定时任务创建成功
任务ID: news001
任务ID: schedule001
调度: 每天 8:00
工具: bocha_search(query='今日新闻')
前缀: 📰 今日新闻播报
工具: read(file_path='~/cow/schedule.txt')
前缀: 📅 今日日程
```
### 5. 定时天气查询(动态工具调用)
### 5. 定时文件备份(动态工具调用)
```
用户: 每天早上7点查询今天的天气
Agent: [创建 cron: 0 7 * * *]
工具: bocha_search(query='今日天气')
前缀: 🌤️ 今日天气预报
用户: 每天晚上11点备份工作文件
Agent: [创建 cron: 0 23 * * *]
工具: bash(command='cp ~/cow/work.txt ~/cow/backup/work_$(date +%Y%m%d).txt')
前缀: ✅ 文件已备份
```
### 6. 周报提醒(动态工具调用
### 6. 周报提醒(静态消息
```
用户: 每周五下午5点搜索本周热点
用户: 每周五下午5点提醒我写周报
Agent: [创建 cron: 0 17 * * 5]
工具: bocha_search(query='本周热点新闻')
前缀: 📊 本周热点回顾
消息: 📊 该写周报了!
```
### 4. 特定日期提醒

View File

@@ -45,10 +45,17 @@ def init_scheduler(agent_bridge) -> bool:
action = task.get("action", {})
action_type = action.get("type")
if action_type == "send_message":
if action_type == "agent_task":
_execute_agent_task(task, agent_bridge)
elif action_type == "send_message":
# Legacy support for old tasks
_execute_send_message(task, agent_bridge)
elif action_type == "tool_call":
# Legacy support for old tasks
_execute_tool_call(task, agent_bridge)
elif action_type == "skill_call":
# Legacy support for old tasks
_execute_skill_call(task, agent_bridge)
else:
logger.warning(f"[Scheduler] Unknown action type: {action_type}")
except Exception as e:
@@ -76,6 +83,100 @@ def get_scheduler_service():
return _scheduler_service
def _execute_agent_task(task: dict, agent_bridge):
"""
Execute an agent_task action - let Agent handle the task
Args:
task: Task dictionary
agent_bridge: AgentBridge instance
"""
try:
action = task.get("action", {})
task_description = action.get("task_description")
receiver = action.get("receiver")
is_group = action.get("is_group", False)
channel_type = action.get("channel_type", "unknown")
if not task_description:
logger.error(f"[Scheduler] Task {task['id']}: No task_description specified")
return
if not receiver:
logger.error(f"[Scheduler] Task {task['id']}: No receiver specified")
return
# Check for unsupported channels
if channel_type == "dingtalk":
logger.warning(f"[Scheduler] Task {task['id']}: DingTalk channel does not support scheduled messages (Stream mode limitation). Task will execute but message cannot be sent.")
logger.info(f"[Scheduler] Task {task['id']}: Executing agent task '{task_description}'")
# Create context for Agent
context = Context(ContextType.TEXT, task_description)
context["receiver"] = receiver
context["isgroup"] = is_group
context["session_id"] = receiver
# Channel-specific setup
if channel_type == "web":
import uuid
request_id = f"scheduler_{task['id']}_{uuid.uuid4().hex[:8]}"
context["request_id"] = request_id
elif channel_type == "feishu":
context["receive_id_type"] = "chat_id" if is_group else "open_id"
context["msg"] = None
elif channel_type == "dingtalk":
# DingTalk requires msg object, set to None for scheduled tasks
context["msg"] = None
# 如果是单聊,需要传递 sender_staff_id
if not is_group:
sender_staff_id = action.get("dingtalk_sender_staff_id")
if sender_staff_id:
context["dingtalk_sender_staff_id"] = sender_staff_id
# Use Agent to execute the task
# Mark this as a scheduled task execution to prevent recursive task creation
context["is_scheduled_task"] = True
try:
reply = agent_bridge.agent_reply(task_description, context=context, on_event=None, clear_history=True)
if reply and reply.content:
# Send the reply via channel
from channel.channel_factory import create_channel
try:
channel = create_channel(channel_type)
if channel:
# For web channel, register request_id
if channel_type == "web" and hasattr(channel, 'request_to_session'):
request_id = context.get("request_id")
if request_id:
channel.request_to_session[request_id] = receiver
logger.debug(f"[Scheduler] Registered request_id {request_id} -> session {receiver}")
# Send the reply
channel.send(reply, context)
logger.info(f"[Scheduler] Task {task['id']} executed successfully, result sent to {receiver}")
else:
logger.error(f"[Scheduler] Failed to create channel: {channel_type}")
except Exception as e:
logger.error(f"[Scheduler] Failed to send result: {e}")
else:
logger.error(f"[Scheduler] Task {task['id']}: No result from agent execution")
except Exception as e:
logger.error(f"[Scheduler] Failed to execute task via Agent: {e}")
import traceback
logger.error(f"[Scheduler] Traceback: {traceback.format_exc()}")
except Exception as e:
logger.error(f"[Scheduler] Error in _execute_agent_task: {e}")
import traceback
logger.error(f"[Scheduler] Traceback: {traceback.format_exc()}")
def _execute_send_message(task: dict, agent_bridge):
"""
Execute a send_message action
@@ -116,6 +217,17 @@ def _execute_send_message(task: dict, agent_bridge):
# Feishu channel will detect this and send as new message instead of reply
context["msg"] = None
logger.debug(f"[Scheduler] Feishu: receive_id_type={context['receive_id_type']}, is_group={is_group}, receiver={receiver}")
elif channel_type == "dingtalk":
# DingTalk channel setup
context["msg"] = None
# 如果是单聊,需要传递 sender_staff_id
if not is_group:
sender_staff_id = action.get("dingtalk_sender_staff_id")
if sender_staff_id:
context["dingtalk_sender_staff_id"] = sender_staff_id
logger.debug(f"[Scheduler] DingTalk single chat: sender_staff_id={sender_staff_id}")
else:
logger.warning(f"[Scheduler] Task {task['id']}: DingTalk single chat message missing sender_staff_id")
# Create reply
reply = Reply(ReplyType.TEXT, content)
@@ -156,8 +268,9 @@ def _execute_tool_call(task: dict, agent_bridge):
"""
try:
action = task.get("action", {})
tool_name = action.get("tool_name")
tool_params = action.get("tool_params", {})
# Support both old and new field names
tool_name = action.get("call_name") or action.get("tool_name")
tool_params = action.get("call_params") or action.get("tool_params", {})
result_prefix = action.get("result_prefix", "")
receiver = action.get("receiver")
is_group = action.get("is_group", False)
@@ -237,6 +350,82 @@ def _execute_tool_call(task: dict, agent_bridge):
logger.error(f"[Scheduler] Error in _execute_tool_call: {e}")
def _execute_skill_call(task: dict, agent_bridge):
    """
    Execute a skill_call action by asking the Agent to run the skill.

    Builds a natural-language query from the stored skill name/params and
    routes it through agent_bridge.agent_reply so the Agent performs the
    skill and the result is delivered to the original receiver.

    Args:
        task: Task dictionary; its "action" dict carries the skill call
              details and the delivery target.
        agent_bridge: AgentBridge instance used to execute the query.
    """
    try:
        action = task.get("action", {})
        # Support both old and new field names
        skill_name = action.get("call_name") or action.get("skill_name")
        skill_params = action.get("call_params") or action.get("skill_params", {})
        result_prefix = action.get("result_prefix", "")
        receiver = action.get("receiver")
        # BUG FIX: tasks are persisted with "is_group" (see the task-creation
        # code and _execute_tool_call); the old "isgroup" key is kept as a
        # fallback for previously stored tasks.
        is_group = action.get("is_group", action.get("isgroup", False))
        channel_type = action.get("channel_type", "unknown")

        if not skill_name:
            logger.error(f"[Scheduler] Task {task['id']}: No skill_name specified")
            return
        if not receiver:
            logger.error(f"[Scheduler] Task {task['id']}: No receiver specified")
            return

        logger.info(f"[Scheduler] Task {task['id']}: Executing skill '{skill_name}' with params {skill_params}")

        # Build a natural language query for the Agent to execute the skill
        # Format: "Use skill-name to do something with params"
        param_str = ", ".join([f"{k}={v}" for k, v in skill_params.items()])
        query = f"Use {skill_name} skill"
        if param_str:
            query += f" with {param_str}"

        # Create context for Agent so the channel layer knows where to deliver
        context = Context(ContextType.TEXT, query)
        context["receiver"] = receiver
        context["isgroup"] = is_group
        context["session_id"] = receiver

        # Channel-specific setup
        if channel_type == "web":
            import uuid
            request_id = f"scheduler_{task['id']}_{uuid.uuid4().hex[:8]}"
            context["request_id"] = request_id
        elif channel_type == "feishu":
            # Feishu needs an id type to route the push; no original msg exists
            # for scheduled tasks.
            context["receive_id_type"] = "chat_id" if is_group else "open_id"
            context["msg"] = None

        # Use Agent to execute the skill
        try:
            reply = agent_bridge.agent_reply(query, context=context, on_event=None, clear_history=True)
            if reply and reply.content:
                content = reply.content
                # Add prefix if specified
                if result_prefix:
                    content = f"{result_prefix}\n\n{content}"
                logger.info(f"[Scheduler] Task {task['id']} executed: skill result sent to {receiver}")
            else:
                logger.error(f"[Scheduler] Task {task['id']}: No result from skill execution")
        except Exception as e:
            logger.error(f"[Scheduler] Failed to execute skill via Agent: {e}")
            import traceback
            logger.error(f"[Scheduler] Traceback: {traceback.format_exc()}")
    except Exception as e:
        logger.error(f"[Scheduler] Error in _execute_skill_call: {e}")
        import traceback
        logger.error(f"[Scheduler] Traceback: {traceback.format_exc()}")
def attach_scheduler_to_tool(tool, context: Context = None):
"""
Attach scheduler components to a SchedulerTool instance

View File

@@ -118,6 +118,34 @@ class SchedulerService:
try:
next_run = datetime.fromisoformat(next_run_str)
# Check if task is overdue (e.g., service restart)
if next_run < now:
time_diff = (now - next_run).total_seconds()
# If overdue by more than 5 minutes, skip this run and schedule next
if time_diff > 300: # 5 minutes
logger.warning(f"[Scheduler] Task {task['id']} is overdue by {int(time_diff)}s, skipping and scheduling next run")
# For one-time tasks, disable them
schedule = task.get("schedule", {})
if schedule.get("type") == "once":
self.task_store.update_task(task['id'], {
"enabled": False,
"last_run_at": now.isoformat()
})
logger.info(f"[Scheduler] One-time task {task['id']} expired, disabled")
return False
# For recurring tasks, calculate next run from now
next_next_run = self._calculate_next_run(task, now)
if next_next_run:
self.task_store.update_task(task['id'], {
"next_run_at": next_next_run.isoformat()
})
logger.info(f"[Scheduler] Rescheduled task {task['id']} to {next_next_run}")
return False
return now >= next_run
except:
return False

View File

@@ -20,23 +20,16 @@ class SchedulerTool(BaseTool):
name: str = "scheduler"
description: str = (
"创建、查询和管理定时任务。支持两种任务类型:\n"
"1. 静态消息任务:定时发送预定义的消息\n"
"2. 动态工具任务:定时执行工具调用并发送结果(如搜索新闻、查询天气等)\n\n"
"创建、查询和管理定时任务。支持固定消息和AI任务两种类型。\n\n"
"使用方法:\n"
"- 创建静态消息任务action='create', name='任务名', message='消息内容', schedule_type='interval'/'cron'/'once', schedule_value='间隔秒数/cron表达式/时间'\n"
"- 创建动态工具任务action='create', name='任务名', tool_call={'tool_name': '工具名', 'tool_params': {...}, 'result_prefix': '前缀'}, schedule_type='interval'/'cron'/'once', schedule_value=''\n"
"- 查询列表action='list'\n"
"- 查看详情action='get', task_id='任务ID'\n"
"- 删除任务action='delete', task_id='任务ID'\n"
"- 启用任务action='enable', task_id='任务ID'\n"
"- 禁用任务action='disable', task_id='任务ID'\n\n"
"调度类型说明:\n"
"- interval: 固定间隔秒数如3600表示每小时\n"
"- cron: cron表达式'0 9 * * *'表示每天9点'*/10 * * * *'表示每10分钟\n"
"- once: 一次性任务ISO时间格式'2024-12-25T09:00:00'\n\n"
"示例每天早上8点搜索新闻\n"
"action='create', name='每日新闻', tool_call={'tool_name': 'bocha_search', 'tool_params': {'query': '今日新闻'}, 'result_prefix': '📰 今日新闻播报'}, schedule_type='cron', schedule_value='0 8 * * *'"
"- 创建action='create', name='任务名', message/ai_task='内容', schedule_type='once/interval/cron', schedule_value='...'\n"
"- 查询action='list' / action='get', task_id='任务ID'\n"
"- 管理action='delete/enable/disable', task_id='任务ID'\n\n"
"调度类型:\n"
"- once: 一次性任务,支持相对时间(+5s,+10m,+1h,+1d)或ISO时间\n"
"- interval: 固定间隔(秒)如3600表示每小时\n"
"- cron: cron表达式'0 8 * * *'表示每天8点\n\n"
"注意:'X秒后'用once+相对时间,'每X秒'用interval"
)
params: dict = {
"type": "object",
@@ -56,26 +49,11 @@ class SchedulerTool(BaseTool):
},
"message": {
"type": "string",
"description": "要发送的静态消息内容 (用于 create 操作与tool_call二选一)"
"description": "固定消息内容 (与ai_task二选一)"
},
"tool_call": {
"type": "object",
"description": "要执行的工具调用 (用于 create 操作与message二选一)",
"properties": {
"tool_name": {
"type": "string",
"description": "工具名称,如 'bocha_search'"
},
"tool_params": {
"type": "object",
"description": "工具参数"
},
"result_prefix": {
"type": "string",
"description": "结果前缀,如 '今日新闻:'"
}
},
"required": ["tool_name"]
"ai_task": {
"type": "string",
"description": "AI任务描述 (与message二选一),如'搜索今日新闻''查询天气'"
},
"schedule_type": {
"type": "string",
@@ -84,12 +62,7 @@ class SchedulerTool(BaseTool):
},
"schedule_value": {
"type": "string",
"description": (
"调度值 (用于 create 操作):\n"
"- cron类型: cron表达式'0 9 * * *' (每天9点)'*/10 * * * *' (每10分钟)\n"
"- interval类型: 间隔秒数,如 '3600' (每小时)'10' (每10秒)\n"
"- once类型: ISO时间'2024-12-25T09:00:00'"
)
"description": "调度值: cron表达式/间隔秒数/时间(+5s,+10m,+1h或ISO格式)"
}
},
"required": ["action"]
@@ -151,17 +124,20 @@ class SchedulerTool(BaseTool):
"""Create a new scheduled task"""
name = kwargs.get("name")
message = kwargs.get("message")
tool_call = kwargs.get("tool_call")
ai_task = kwargs.get("ai_task")
schedule_type = kwargs.get("schedule_type")
schedule_value = kwargs.get("schedule_value")
# Validate required fields
if not name:
return "错误: 缺少任务名称 (name)"
if not message and not tool_call:
return "错误: 必须提供 message 或 tool_call 之一"
if message and tool_call:
return "错误: message 和 tool_call 不能同时提供,请选择其"
# Check that exactly one of message/ai_task is provided
if not message and not ai_task:
return "错误: 必须提供 message(固定消息)或 ai_taskAI任务"
if message and ai_task:
return "错误: message 和 ai_task 只能提供其中一个"
if not schedule_type:
return "错误: 缺少调度类型 (schedule_type)"
if not schedule_value:
@@ -181,7 +157,7 @@ class SchedulerTool(BaseTool):
# Create task
task_id = str(uuid.uuid4())[:8]
# Build action based on message or tool_call
# Build action based on message or ai_task
if message:
action = {
"type": "send_message",
@@ -191,19 +167,22 @@ class SchedulerTool(BaseTool):
"is_group": context.get("isgroup", False),
"channel_type": self.config.get("channel_type", "unknown")
}
else: # tool_call
else: # ai_task
action = {
"type": "tool_call",
"tool_name": tool_call.get("tool_name"),
"tool_params": tool_call.get("tool_params", {}),
"result_prefix": tool_call.get("result_prefix", ""),
"type": "agent_task",
"task_description": ai_task,
"receiver": context.get("receiver"),
"receiver_name": self._get_receiver_name(context),
"is_group": context.get("isgroup", False),
"channel_type": self.config.get("channel_type", "unknown")
}
task = {
# 针对钉钉单聊,额外存储 sender_staff_id
msg = context.kwargs.get("msg")
if msg and hasattr(msg, 'sender_staff_id') and not context.get("isgroup", False):
action["dingtalk_sender_staff_id"] = msg.sender_staff_id
task_data = {
"id": task_id,
"name": name,
"enabled": True,
@@ -214,26 +193,21 @@ class SchedulerTool(BaseTool):
}
# Calculate initial next_run_at
next_run = self._calculate_next_run(task)
next_run = self._calculate_next_run(task_data)
if next_run:
task["next_run_at"] = next_run.isoformat()
task_data["next_run_at"] = next_run.isoformat()
# Save task
self.task_store.add_task(task)
self.task_store.add_task(task_data)
# Format response
schedule_desc = self._format_schedule_description(schedule)
receiver_desc = task["action"]["receiver_name"] or task["action"]["receiver"]
receiver_desc = task_data["action"]["receiver_name"] or task_data["action"]["receiver"]
if message:
content_desc = f"💬 消息: {message}"
content_desc = f"💬 固定消息: {message}"
else:
tool_name = tool_call.get("tool_name")
tool_params_str = str(tool_call.get("tool_params", {}))
prefix = tool_call.get("result_prefix", "")
content_desc = f"🔧 工具调用: {tool_name}({tool_params_str})"
if prefix:
content_desc += f"\n📝 结果前缀: {prefix}"
content_desc = f"🤖 AI任务: {ai_task}"
return (
f"✅ 定时任务创建成功\n\n"
@@ -353,9 +327,38 @@ class SchedulerTool(BaseTool):
return {"type": "interval", "seconds": seconds}
elif schedule_type == "once":
# Parse datetime
datetime.fromisoformat(schedule_value)
return {"type": "once", "run_at": schedule_value}
# Parse datetime - support both relative and absolute time
# Check if it's relative time (e.g., "+5s", "+10m", "+1h", "+1d")
if schedule_value.startswith("+"):
import re
match = re.match(r'\+(\d+)([smhd])', schedule_value)
if match:
amount = int(match.group(1))
unit = match.group(2)
from datetime import timedelta
now = datetime.now()
if unit == 's': # seconds
target_time = now + timedelta(seconds=amount)
elif unit == 'm': # minutes
target_time = now + timedelta(minutes=amount)
elif unit == 'h': # hours
target_time = now + timedelta(hours=amount)
elif unit == 'd': # days
target_time = now + timedelta(days=amount)
else:
return None
return {"type": "once", "run_at": target_time.isoformat()}
else:
logger.error(f"[SchedulerTool] Invalid relative time format: {schedule_value}")
return None
else:
# Absolute time in ISO format
datetime.fromisoformat(schedule_value)
return {"type": "once", "run_at": schedule_value}
except Exception as e:
logger.error(f"[SchedulerTool] Invalid schedule: {e}")

View File

@@ -0,0 +1,3 @@
# Package entry point for the send tool: re-export the Send tool class.
from .send import Send

# Explicit public API of this package.
__all__ = ['Send']

159
agent/tools/send/send.py Normal file
View File

@@ -0,0 +1,159 @@
"""
Send tool - Send files to the user
"""
import os
from typing import Dict, Any
from pathlib import Path
from agent.tools.base_tool import BaseTool, ToolResult
class Send(BaseTool):
    """Tool that lets the agent deliver a local file (image, video, audio,
    document, or generic binary) back to the user via the active channel.

    The tool itself never transmits anything: it validates the file and
    returns a "file_to_send" metadata dict that the channel layer consumes.
    """

    name: str = "send"
    description: str = "Send a file (image, video, audio, document) to the user. Use this when the user explicitly asks to send/share a file."
    params: dict = {
        "type": "object",
        "properties": {
            "path": {
                "type": "string",
                "description": "Path to the file to send. Can be absolute path or relative to workspace."
            },
            "message": {
                "type": "string",
                "description": "Optional message to accompany the file"
            }
        },
        "required": ["path"]
    }

    def __init__(self, config: dict = None):
        self.config = config or {}
        # Base directory used when resolving relative paths.
        self.cwd = self.config.get("cwd", os.getcwd())
        # Recognized extensions per media category.
        self.image_extensions = {'.jpg', '.jpeg', '.png', '.gif', '.webp', '.bmp', '.svg', '.ico'}
        self.video_extensions = {'.mp4', '.avi', '.mov', '.mkv', '.flv', '.wmv', '.webm', '.m4v'}
        self.audio_extensions = {'.mp3', '.wav', '.ogg', '.m4a', '.flac', '.aac', '.wma'}
        self.document_extensions = {'.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.txt', '.md'}

    def execute(self, args: Dict[str, Any]) -> ToolResult:
        """
        Validate the requested file and build the metadata the channel
        layer needs to actually transmit it.

        :param args: dict with "path" (required) and optional "message"
        :return: ToolResult whose payload is a "file_to_send" metadata dict
        """
        raw_path = args.get("path", "").strip()
        note = args.get("message", "")

        if not raw_path:
            return ToolResult.fail("Error: path parameter is required")

        absolute_path = self._resolve_path(raw_path)

        # The file must exist and be readable before we hand it to a channel.
        if not os.path.exists(absolute_path):
            return ToolResult.fail(f"Error: File not found: {raw_path}")
        if not os.access(absolute_path, os.R_OK):
            return ToolResult.fail(f"Error: File is not readable: {raw_path}")

        target = Path(absolute_path)
        ext = target.suffix.lower()
        byte_count = os.path.getsize(absolute_path)
        file_name = target.name

        # Map the extension to a category + MIME type; anything unknown
        # falls back to a generic binary file.
        categories = (
            (self.image_extensions, "image", self._get_image_mime_type),
            (self.video_extensions, "video", self._get_video_mime_type),
            (self.audio_extensions, "audio", self._get_audio_mime_type),
            (self.document_extensions, "document", self._get_document_mime_type),
        )
        file_type, mime_type = "file", "application/octet-stream"
        for extensions, category, mime_lookup in categories:
            if ext in extensions:
                file_type, mime_type = category, mime_lookup(ext)
                break

        # "file_to_send" metadata consumed by the channel layer.
        return ToolResult.success({
            "type": "file_to_send",
            "file_type": file_type,
            "path": absolute_path,
            "file_name": file_name,
            "mime_type": mime_type,
            "size": byte_count,
            "size_formatted": self._format_size(byte_count),
            "message": note or f"正在发送 {file_name}"
        })

    def _resolve_path(self, path: str) -> str:
        """Expand ~ and turn a (possibly relative) path into an absolute one."""
        expanded = os.path.expanduser(path)
        if not os.path.isabs(expanded):
            expanded = os.path.abspath(os.path.join(self.cwd, expanded))
        return expanded

    def _get_image_mime_type(self, ext: str) -> str:
        """Get MIME type for an image extension (defaults to JPEG)."""
        return {
            '.jpg': 'image/jpeg',
            '.jpeg': 'image/jpeg',
            '.png': 'image/png',
            '.gif': 'image/gif',
            '.webp': 'image/webp',
            '.bmp': 'image/bmp',
            '.svg': 'image/svg+xml',
            '.ico': 'image/x-icon',
        }.get(ext, 'image/jpeg')

    def _get_video_mime_type(self, ext: str) -> str:
        """Get MIME type for a video extension (defaults to MP4)."""
        return {
            '.mp4': 'video/mp4',
            '.avi': 'video/x-msvideo',
            '.mov': 'video/quicktime',
            '.mkv': 'video/x-matroska',
            '.webm': 'video/webm',
            '.flv': 'video/x-flv',
        }.get(ext, 'video/mp4')

    def _get_audio_mime_type(self, ext: str) -> str:
        """Get MIME type for an audio extension (defaults to MP3)."""
        return {
            '.mp3': 'audio/mpeg',
            '.wav': 'audio/wav',
            '.ogg': 'audio/ogg',
            '.m4a': 'audio/mp4',
            '.flac': 'audio/flac',
            '.aac': 'audio/aac',
        }.get(ext, 'audio/mpeg')

    def _get_document_mime_type(self, ext: str) -> str:
        """Get MIME type for a document extension (defaults to octet-stream)."""
        return {
            '.pdf': 'application/pdf',
            '.doc': 'application/msword',
            '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
            '.xls': 'application/vnd.ms-excel',
            '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
            '.ppt': 'application/vnd.ms-powerpoint',
            '.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation',
            '.txt': 'text/plain',
            '.md': 'text/markdown',
        }.get(ext, 'application/octet-stream')

    def _format_size(self, size_bytes: int) -> str:
        """Render a byte count as a short human-readable string (e.g. 1.5MB)."""
        remaining = size_bytes
        for unit in ('B', 'KB', 'MB', 'GB'):
            if remaining < 1024.0:
                return f"{remaining:.1f}{unit}"
            remaining = remaining / 1024.0
        return f"{remaining:.1f}TB"

View File

@@ -2,6 +2,7 @@
Agent Bridge - Integrates Agent system with existing COW bridge
"""
import os
from typing import Optional, List
from agent.protocol import Agent, LLMModel, LLMRequest
@@ -269,8 +270,11 @@ class AgentBridge:
# Get workspace from config
workspace_root = os.path.expanduser(conf().get("agent_workspace", "~/cow"))
# Load environment variables from workspace .env file
env_file = os.path.join(workspace_root, '.env')
# Migrate API keys from config.json to environment variables (if not already set)
self._migrate_config_to_env(workspace_root)
# Load environment variables from secure .env file location
env_file = os.path.expanduser("~/.cow/.env")
if os.path.exists(env_file):
try:
from dotenv import load_dotenv
@@ -280,9 +284,6 @@ class AgentBridge:
logger.warning("[AgentBridge] python-dotenv not installed, skipping .env file loading")
except Exception as e:
logger.warning(f"[AgentBridge] Failed to load .env file: {e}")
# Migrate API keys from config.json to environment variables (if not already set)
self._migrate_config_to_env(workspace_root)
# Initialize workspace and create template files
from agent.prompt import ensure_workspace, load_context_files, PromptBuilder
@@ -377,7 +378,6 @@ class AgentBridge:
if tool_name == "env_config":
from agent.tools import EnvConfig
tool = EnvConfig({
"workspace_dir": workspace_root,
"agent_bridge": self # Pass self reference for hot reload
})
else:
@@ -390,12 +390,6 @@ class AgentBridge:
tool.cwd = file_config.get("cwd", tool.cwd if hasattr(tool, 'cwd') else None)
if 'memory_manager' in file_config:
tool.memory_manager = file_config['memory_manager']
# Apply API key for bocha_search tool
elif tool_name == 'bocha_search':
bocha_api_key = conf().get("bocha_api_key", "")
if bocha_api_key:
tool.config = {"bocha_api_key": bocha_api_key}
tool.api_key = bocha_api_key
tools.append(tool)
logger.debug(f"[AgentBridge] Loaded tool: {tool_name}")
except Exception as e:
@@ -504,8 +498,11 @@ class AgentBridge:
# Get workspace from config
workspace_root = os.path.expanduser(conf().get("agent_workspace", "~/cow"))
# Load environment variables from workspace .env file
env_file = os.path.join(workspace_root, '.env')
# Migrate API keys from config.json to environment variables (if not already set)
self._migrate_config_to_env(workspace_root)
# Load environment variables from secure .env file location
env_file = os.path.expanduser("~/.cow/.env")
if os.path.exists(env_file):
try:
from dotenv import load_dotenv
@@ -609,11 +606,6 @@ class AgentBridge:
tool.cwd = file_config.get("cwd", tool.cwd if hasattr(tool, 'cwd') else None)
if 'memory_manager' in file_config:
tool.memory_manager = file_config['memory_manager']
elif tool_name == 'bocha_search':
bocha_api_key = conf().get("bocha_api_key", "")
if bocha_api_key:
tool.config = {"bocha_api_key": bocha_api_key}
tool.api_key = bocha_api_key
tools.append(tool)
except Exception as e:
logger.warning(f"[AgentBridge] Failed to load tool {tool_name} for session {session_id}: {e}")
@@ -767,23 +759,52 @@ class AgentBridge:
if not agent:
return Reply(ReplyType.ERROR, "Failed to initialize super agent")
# Attach context to scheduler tool if present
if context and agent.tools:
for tool in agent.tools:
if tool.name == "scheduler":
try:
from agent.tools.scheduler.integration import attach_scheduler_to_tool
attach_scheduler_to_tool(tool, context)
except Exception as e:
logger.warning(f"[AgentBridge] Failed to attach context to scheduler: {e}")
break
# Filter tools based on context
original_tools = agent.tools
filtered_tools = original_tools
# Use agent's run_stream method
response = agent.run_stream(
user_message=query,
on_event=on_event,
clear_history=clear_history
)
# If this is a scheduled task execution, exclude scheduler tool to prevent recursion
if context and context.get("is_scheduled_task"):
filtered_tools = [tool for tool in agent.tools if tool.name != "scheduler"]
agent.tools = filtered_tools
logger.info(f"[AgentBridge] Scheduled task execution: excluded scheduler tool ({len(filtered_tools)}/{len(original_tools)} tools)")
else:
# Attach context to scheduler tool if present
if context and agent.tools:
for tool in agent.tools:
if tool.name == "scheduler":
try:
from agent.tools.scheduler.integration import attach_scheduler_to_tool
attach_scheduler_to_tool(tool, context)
except Exception as e:
logger.warning(f"[AgentBridge] Failed to attach context to scheduler: {e}")
break
try:
# Use agent's run_stream method
response = agent.run_stream(
user_message=query,
on_event=on_event,
clear_history=clear_history
)
finally:
# Restore original tools
if context and context.get("is_scheduled_task"):
agent.tools = original_tools
# Check if there are files to send (from read tool)
if hasattr(agent, 'stream_executor') and hasattr(agent.stream_executor, 'files_to_send'):
files_to_send = agent.stream_executor.files_to_send
if files_to_send:
# Send the first file (for now, handle one file at a time)
file_info = files_to_send[0]
logger.info(f"[AgentBridge] Sending file: {file_info.get('path')}")
# Clear files_to_send for next request
agent.stream_executor.files_to_send = []
# Return file reply based on file type
return self._create_file_reply(file_info, response, context)
return Reply(ReplyType.TEXT, response)
@@ -791,12 +812,53 @@ class AgentBridge:
logger.error(f"Agent reply error: {e}")
return Reply(ReplyType.ERROR, f"Agent error: {str(e)}")
def _create_file_reply(self, file_info: dict, text_response: str, context: Context = None) -> Reply:
    """
    Create a reply for sending files

    Args:
        file_info: File metadata from read tool ("file_type", "path",
                   "file_name", optional "message")
        text_response: Text response from agent
        context: Context object (currently unused; kept for interface stability)

    Returns:
        Reply object for file sending
    """
    file_type = file_info.get("file_type", "file")
    file_path = file_info.get("path")

    # For images, use IMAGE_URL type (channel will handle upload)
    if file_type == "image":
        # Convert local path to file:// URL for channel processing
        file_url = f"file://{file_path}"
        logger.info(f"[AgentBridge] Sending image: {file_url}")
        reply = Reply(ReplyType.IMAGE_URL, file_url)
        # Attach text message if present (for channels that support text+image)
        if text_response:
            reply.text_content = text_response  # Store accompanying text
        return reply

    # For documents (PDF, Excel, Word, PPT), use FILE type
    if file_type == "document":
        file_url = f"file://{file_path}"
        logger.info(f"[AgentBridge] Sending document: {file_url}")
        reply = Reply(ReplyType.FILE, file_url)
        reply.file_name = file_info.get("file_name", os.path.basename(file_path))
        # FIX: keep the accompanying text for documents as well, mirroring the
        # image branch above, so channels that support text+file don't silently
        # drop the agent's reply text. Channels that ignore text_content are
        # unaffected.
        if text_response:
            reply.text_content = text_response
        return reply

    # For other files (video, audio), we need channel-specific handling
    # For now, return text with file info
    # TODO: Implement video/audio sending when channel supports it
    message = text_response or file_info.get("message", "文件已准备")
    message += f"\n\n[文件: {file_info.get('file_name', file_path)}]"
    return Reply(ReplyType.TEXT, message)
def _migrate_config_to_env(self, workspace_root: str):
"""
Migrate API keys from config.json to .env file if not already set
Args:
workspace_root: Workspace directory path
workspace_root: Workspace directory path (not used, kept for compatibility)
"""
from config import conf
import os
@@ -810,7 +872,8 @@ class AgentBridge:
"linkai_api_key": "LINKAI_API_KEY",
}
env_file = os.path.join(workspace_root, '.env')
# Use fixed secure location for .env file
env_file = os.path.expanduser("~/.cow/.env")
# Read existing env vars from .env file
existing_env_vars = {}
@@ -830,19 +893,25 @@ class AgentBridge:
for config_key, env_key in key_mapping.items():
# Skip if already in .env file
if env_key in existing_env_vars:
logger.debug(f"[AgentBridge] Skipping {env_key} - already in .env")
continue
# Get value from config.json
value = conf().get(config_key, "")
if value and value.strip(): # Only migrate non-empty values
keys_to_migrate[env_key] = value.strip()
logger.debug(f"[AgentBridge] Will migrate {env_key} from config.json")
else:
logger.debug(f"[AgentBridge] Skipping {env_key} - no value in config.json")
# Write new keys to .env file
if keys_to_migrate:
try:
# Ensure .env file exists
# Ensure ~/.cow directory and .env file exist
env_dir = os.path.dirname(env_file)
if not os.path.exists(env_dir):
os.makedirs(env_dir, exist_ok=True)
if not os.path.exists(env_file):
os.makedirs(os.path.dirname(env_file), exist_ok=True)
open(env_file, 'a').close()
# Append new keys

View File

@@ -64,15 +64,22 @@ class ChatChannel(Channel):
check_contain(group_name, group_name_keyword_white_list),
]
):
group_chat_in_one_session = conf().get("group_chat_in_one_session", [])
session_id = cmsg.actual_user_id
if any(
[
group_name in group_chat_in_one_session,
"ALL_GROUP" in group_chat_in_one_session,
]
):
# Check global group_shared_session config first
group_shared_session = conf().get("group_shared_session", True)
if group_shared_session:
# All users in the group share the same session
session_id = group_id
else:
# Check group-specific whitelist (legacy behavior)
group_chat_in_one_session = conf().get("group_chat_in_one_session", [])
session_id = cmsg.actual_user_id
if any(
[
group_name in group_chat_in_one_session,
"ALL_GROUP" in group_chat_in_one_session,
]
):
session_id = group_id
else:
logger.debug(f"No need reply, groupName not in whitelist, group_name={group_name}")
return None
@@ -283,7 +290,98 @@ class ChatChannel(Channel):
reply = e_context["reply"]
if not e_context.is_pass() and reply and reply.type:
logger.debug("[chat_channel] ready to send reply: {}, context: {}".format(reply, context))
self._send(reply, context)
# 如果是文本回复,尝试提取并发送图片
if reply.type == ReplyType.TEXT:
self._extract_and_send_images(reply, context)
# 如果是图片回复但带有文本内容,先发文本再发图片
elif reply.type == ReplyType.IMAGE_URL and hasattr(reply, 'text_content') and reply.text_content:
# 先发送文本
text_reply = Reply(ReplyType.TEXT, reply.text_content)
self._send(text_reply, context)
# 短暂延迟后发送图片
time.sleep(0.3)
self._send(reply, context)
else:
self._send(reply, context)
def _extract_and_send_images(self, reply: Reply, context: Context):
    """
    Extract image/video URLs from a text reply and send them separately.
    Supported formats: [图片: /path/to/image.png], [视频: /path/to/video.mp4],
    ![](url), <img src="url">, bare image/video URLs.
    Sends at most 5 media files.
    """
    content = reply.content
    media_items = []  # [(url, type), ...]

    # Regex patterns for each supported media reference format
    patterns = [
        (r'\[图片:\s*([^\]]+)\]', 'image'),  # [图片: /path/to/image.png]
        (r'\[视频:\s*([^\]]+)\]', 'video'),  # [视频: /path/to/video.mp4]
        (r'!\[.*?\]\(([^\)]+)\)', 'image'),  # ![alt](url) - treated as image
        (r'<img[^>]+src=["\']([^"\']+)["\']', 'image'),  # <img src="url">
        (r'<video[^>]+src=["\']([^"\']+)["\']', 'video'),  # <video src="url">
        (r'https?://[^\s]+\.(?:jpg|jpeg|png|gif|webp)', 'image'),  # bare image URL
        (r'https?://[^\s]+\.(?:mp4|avi|mov|wmv|flv)', 'video'),  # bare video URL
    ]
    for pattern, media_type in patterns:
        matches = re.findall(pattern, content, re.IGNORECASE)
        for match in matches:
            # FIX: strip surrounding whitespace so captures like
            # "[图片: /a.png ]" still resolve via os.path.exists below
            media_items.append((match.strip(), media_type))

    # De-duplicate while preserving order, and cap at 5 items
    seen = set()
    unique_items = []
    for url, mtype in media_items:
        if url not in seen:
            seen.add(url)
            unique_items.append((url, mtype))
    media_items = unique_items[:5]

    if media_items:
        logger.info(f"[chat_channel] Extracted {len(media_items)} media item(s) from reply")
        # Send the text first (original text is left untouched)
        self._send(reply, context)
        # Then send each media item one by one
        for i, (url, media_type) in enumerate(media_items):
            try:
                # Decide between remote URL and local file
                if url.startswith(('http://', 'https://')):
                    # Remote resource
                    if media_type == 'video':
                        # Videos go out as FILE replies
                        media_reply = Reply(ReplyType.FILE, url)
                        media_reply.file_name = os.path.basename(url)
                    else:
                        # Images go out as IMAGE_URL replies
                        media_reply = Reply(ReplyType.IMAGE_URL, url)
                elif os.path.exists(url):
                    # Local file: convert to file:// URL
                    if media_type == 'video':
                        media_reply = Reply(ReplyType.FILE, f"file://{url}")
                        media_reply.file_name = os.path.basename(url)
                    else:
                        media_reply = Reply(ReplyType.IMAGE_URL, f"file://{url}")
                else:
                    logger.warning(f"[chat_channel] Media file not found or invalid URL: {url}")
                    continue
                # Small delay between sends to avoid rate limiting
                if i > 0:
                    time.sleep(0.5)
                self._send(media_reply, context)
                logger.info(f"[chat_channel] Sent {media_type} {i+1}/{len(media_items)}: {url[:50]}...")
            except Exception as e:
                logger.error(f"[chat_channel] Failed to send {media_type} {url}: {e}")
    else:
        # No media found: send the text reply as-is
        self._send(reply, context)
def _send(self, reply: Reply, context: Context, retry_cnt=0):
try:

View File

@@ -9,6 +9,7 @@ import json
# -*- coding=utf-8 -*-
import logging
import time
import requests
import dingtalk_stream
from dingtalk_stream import AckMessage
@@ -107,16 +108,156 @@ class DingTalkChanel(ChatChannel, dingtalk_stream.ChatbotHandler):
conf()["group_name_white_list"] = ["ALL_GROUP"]
# 单聊无需前缀
conf()["single_chat_prefix"] = [""]
# Access token cache
self._access_token = None
self._access_token_expires_at = 0
# Robot code cache (extracted from incoming messages)
self._robot_code = None
def startup(self):
credential = dingtalk_stream.Credential(self.dingtalk_client_id, self.dingtalk_client_secret)
client = dingtalk_stream.DingTalkStreamClient(credential)
client.register_callback_handler(dingtalk_stream.chatbot.ChatbotMessage.TOPIC, self)
client.start_forever()
def get_access_token(self):
    """Return a valid enterprise-app access_token, refreshing it when expired.

    Docs: https://open.dingtalk.com/document/orgapp/obtain-orgapp-token
    Returns None when the token cannot be obtained.
    """
    now = time.time()
    # Serve from cache while the token is still considered valid.
    if self._access_token and now < self._access_token_expires_at:
        return self._access_token

    # Request a fresh access_token from the DingTalk OAuth endpoint.
    url = "https://api.dingtalk.com/v1.0/oauth2/accessToken"
    headers = {"Content-Type": "application/json"}
    data = {
        "appKey": self.dingtalk_client_id,
        "appSecret": self.dingtalk_client_secret
    }
    try:
        response = requests.post(url, headers=headers, json=data, timeout=10)
        result = response.json()
        if response.status_code != 200 or "accessToken" not in result:
            logger.error(f"[DingTalk] Failed to get access token: {result}")
            return None
        self._access_token = result["accessToken"]
        # Token is valid for ~2 hours; refresh 5 minutes early to be safe.
        self._access_token_expires_at = now + result.get("expireIn", 7200) - 300
        logger.info("[DingTalk] Access token refreshed successfully")
        return self._access_token
    except Exception as e:
        logger.error(f"[DingTalk] Error getting access token: {e}")
        return None
def send_single_message(self, user_id: str, content: str, robot_code: str) -> bool:
    """Send a text message to one user in a private chat.

    API: https://open.dingtalk.com/document/orgapp/chatbots-send-one-on-one-chat-messages-in-batches
    Returns True on success, False otherwise.
    """
    token = self.get_access_token()
    if not token:
        logger.error("[DingTalk] Failed to send single message: Access token not available.")
        return False
    if not robot_code:
        logger.error("[DingTalk] Cannot send single message: robot_code is required")
        return False

    endpoint = "https://api.dingtalk.com/v1.0/robot/oToMessages/batchSend"
    request_headers = {
        "x-acs-dingtalk-access-token": token,
        "Content-Type": "application/json"
    }
    payload = {
        "msgParam": json.dumps({"content": content}),
        "msgKey": "sampleText",
        "userIds": [user_id],
        "robotCode": robot_code
    }
    logger.info(f"[DingTalk] Sending single message to user {user_id} with robot_code {robot_code}")
    try:
        response = requests.post(endpoint, headers=request_headers, json=payload, timeout=10)
        result = response.json()
        # A processQueryKey in the response indicates the send was accepted.
        if response.status_code == 200 and result.get("processQueryKey"):
            logger.info(f"[DingTalk] Single message sent successfully to {user_id}")
            return True
        logger.error(f"[DingTalk] Failed to send single message: {result}")
        return False
    except Exception as e:
        logger.error(f"[DingTalk] Error sending single message: {e}")
        return False
def send_group_message(self, conversation_id: str, content: str, robot_code: str = None):
    """Proactively push a text message into a group chat.

    Docs: https://open.dingtalk.com/document/orgapp/the-robot-sends-a-group-message

    Args:
        conversation_id: open conversation id (openConversationId) of the group
        robot_code: robot code; callers typically pass dingtalk_client_id
        content: message text to send
    Returns True on success, False otherwise.
    """
    token = self.get_access_token()
    if not token:
        logger.error("[DingTalk] Cannot send group message: no access token")
        return False
    if not robot_code:
        logger.error("[DingTalk] Cannot send group message: robot_code is required")
        return False

    endpoint = "https://api.dingtalk.com/v1.0/robot/groupMessages/send"
    request_headers = {
        "x-acs-dingtalk-access-token": token,
        "Content-Type": "application/json"
    }
    payload = {
        "msgParam": json.dumps({"content": content}),
        "msgKey": "sampleText",
        "openConversationId": conversation_id,
        "robotCode": robot_code
    }
    try:
        response = requests.post(endpoint, headers=request_headers, json=payload, timeout=10)
        result = response.json()
        # NOTE(review): success is judged by HTTP 200 only, unlike
        # send_single_message which also checks the response body — confirm
        # whether this API can return 200 with an error payload.
        if response.status_code != 200:
            logger.error(f"[DingTalk] Failed to send group message: {result}")
            return False
        logger.info(f"[DingTalk] Group message sent successfully to {conversation_id}")
        return True
    except Exception as e:
        logger.error(f"[DingTalk] Error sending group message: {e}")
        return False
async def process(self, callback: dingtalk_stream.CallbackMessage):
try:
incoming_message = dingtalk_stream.ChatbotMessage.from_dict(callback.data)
# Debug: 打印完整的 event 数据
logger.info(f"[DingTalk] ===== Incoming Message Debug =====")
logger.info(f"[DingTalk] callback.data keys: {callback.data.keys() if hasattr(callback.data, 'keys') else 'N/A'}")
logger.info(f"[DingTalk] incoming_message attributes: {dir(incoming_message)}")
logger.info(f"[DingTalk] robot_code: {getattr(incoming_message, 'robot_code', 'N/A')}")
logger.info(f"[DingTalk] chatbot_corp_id: {getattr(incoming_message, 'chatbot_corp_id', 'N/A')}")
logger.info(f"[DingTalk] chatbot_user_id: {getattr(incoming_message, 'chatbot_user_id', 'N/A')}")
logger.info(f"[DingTalk] conversation_id: {getattr(incoming_message, 'conversation_id', 'N/A')}")
logger.info(f"[DingTalk] Raw callback.data: {callback.data}")
logger.info(f"[DingTalk] =====================================")
image_download_handler = self # 传入方法所在的类实例
dingtalk_msg = DingTalkMessage(incoming_message, image_download_handler)
@@ -174,8 +315,48 @@ class DingTalkChanel(ChatChannel, dingtalk_stream.ChatbotHandler):
def send(self, reply: Reply, context: Context):
receiver = context["receiver"]
isgroup = context.kwargs['msg'].is_group
incoming_message = context.kwargs['msg'].incoming_message
# Check if msg exists (for scheduled tasks, msg might be None)
msg = context.kwargs.get('msg')
if msg is None:
# 定时任务场景:使用主动发送 API
is_group = context.get("isgroup", False)
logger.info(f"[DingTalk] Sending scheduled task message to {receiver} (is_group={is_group})")
# 使用缓存的 robot_code 或配置的值
robot_code = self._robot_code or conf().get("dingtalk_robot_code")
logger.info(f"[DingTalk] Using robot_code: {robot_code}, cached: {self._robot_code}, config: {conf().get('dingtalk_robot_code')}")
if not robot_code:
logger.error(f"[DingTalk] Cannot send scheduled task: robot_code not available. Please send at least one message to the bot first, or configure dingtalk_robot_code in config.json")
return
# 根据是否群聊选择不同的 API
if is_group:
success = self.send_group_message(receiver, reply.content, robot_code)
else:
# 单聊场景:尝试从 context 中获取 dingtalk_sender_staff_id
sender_staff_id = context.get("dingtalk_sender_staff_id")
if not sender_staff_id:
logger.error(f"[DingTalk] Cannot send single chat scheduled message: sender_staff_id not available in context")
return
logger.info(f"[DingTalk] Sending single message to staff_id: {sender_staff_id}")
success = self.send_single_message(sender_staff_id, reply.content, robot_code)
if not success:
logger.error(f"[DingTalk] Failed to send scheduled task message")
return
# 从正常消息中提取并缓存 robot_code
if hasattr(msg, 'robot_code'):
robot_code = msg.robot_code
if robot_code and robot_code != self._robot_code:
self._robot_code = robot_code
logger.info(f"[DingTalk] Cached robot_code: {robot_code}")
isgroup = msg.is_group
incoming_message = msg.incoming_message
if conf().get("dingtalk_card_enabled"):
logger.info("[Dingtalk] sendMsg={}, receiver={}".format(reply, receiver))

View File

@@ -22,6 +22,7 @@ class DingTalkMessage(ChatMessage):
self.create_time = event.create_at
self.image_content = event.image_content
self.rich_text_content = event.rich_text_content
self.robot_code = event.robot_code # 机器人编码
if event.conversation_type == "1":
self.is_group = False
else:

View File

@@ -204,10 +204,36 @@ class FeiShuChanel(ChatChannel):
# 图片上传
reply_content = self._upload_image_url(reply.content, access_token)
if not reply_content:
logger.warning("[FeiShu] upload file failed")
logger.warning("[FeiShu] upload image failed")
return
msg_type = "image"
content_key = "image_key"
elif reply.type == ReplyType.FILE:
# 判断是否为视频文件
file_path = reply.content
if file_path.startswith("file://"):
file_path = file_path[7:]
is_video = file_path.lower().endswith(('.mp4', '.avi', '.mov', '.wmv', '.flv'))
if is_video:
# 视频使用 media 类型
file_key = self._upload_video_url(reply.content, access_token)
if not file_key:
logger.warning("[FeiShu] upload video failed")
return
reply_content = file_key
msg_type = "media"
content_key = "file_key"
else:
# 其他文件使用 file 类型
file_key = self._upload_file_url(reply.content, access_token)
if not file_key:
logger.warning("[FeiShu] upload file failed")
return
reply_content = file_key
msg_type = "file"
content_key = "file_key"
# Check if we can reply to an existing message (need msg_id)
can_reply = is_group and msg and hasattr(msg, 'msg_id') and msg.msg_id
@@ -260,7 +286,34 @@ class FeiShuChanel(ChatChannel):
def _upload_image_url(self, img_url, access_token):
logger.debug(f"[WX] start download image, img_url={img_url}")
logger.debug(f"[FeiShu] start process image, img_url={img_url}")
# Check if it's a local file path (file:// protocol)
if img_url.startswith("file://"):
local_path = img_url[7:] # Remove "file://" prefix
logger.info(f"[FeiShu] uploading local file: {local_path}")
if not os.path.exists(local_path):
logger.error(f"[FeiShu] local file not found: {local_path}")
return None
# Upload directly from local file
upload_url = "https://open.feishu.cn/open-apis/im/v1/images"
data = {'image_type': 'message'}
headers = {'Authorization': f'Bearer {access_token}'}
with open(local_path, "rb") as file:
upload_response = requests.post(upload_url, files={"image": file}, data=data, headers=headers)
logger.info(f"[FeiShu] upload file, res={upload_response.content}")
response_data = upload_response.json()
if response_data.get("code") == 0:
return response_data.get("data").get("image_key")
else:
logger.error(f"[FeiShu] upload failed: {response_data}")
return None
# Original logic for HTTP URLs
response = requests.get(img_url)
suffix = utils.get_path_suffix(img_url)
temp_name = str(uuid.uuid4()) + "." + suffix
@@ -283,6 +336,207 @@ class FeiShuChanel(ChatChannel):
os.remove(temp_name)
return upload_response.json().get("data").get("image_key")
def _upload_video_url(self, video_url, access_token):
    """Upload a video to Feishu and return its file_key (for "media" type messages).

    Supports:
      - file:// URLs for local files (uploaded directly)
      - http(s):// URLs (downloaded to a temp file, uploaded, temp file removed)

    Args:
        video_url: Video location, either a ``file://`` path or an HTTP(S) URL.
        access_token: Feishu tenant access token used for authorization.

    Returns:
        The Feishu ``file_key`` string on success, or ``None`` on any failure.
    """
    upload_url = "https://open.feishu.cn/open-apis/im/v1/files"

    def _do_upload(local_path, file_name):
        """Upload one local video file to Feishu; return file_key or None."""
        # Feishu "media" messages only support mp4, so the type is always mp4
        # (the original mapping defaulted every extension to mp4 anyway).
        data = {'file_type': 'mp4', 'file_name': file_name}
        headers = {'Authorization': f'Bearer {access_token}'}
        try:
            with open(local_path, "rb") as file:
                upload_response = requests.post(
                    upload_url,
                    files={"file": file},
                    data=data,
                    headers=headers,
                    timeout=(5, 60)  # 5s connect, 60s read timeout (videos are larger)
                )
            logger.info(f"[FeiShu] upload video response, status={upload_response.status_code}, res={upload_response.content}")
            response_data = upload_response.json()
            if response_data.get("code") == 0:
                return response_data.get("data").get("file_key")
            logger.error(f"[FeiShu] upload video failed: {response_data}")
            return None
        except Exception as e:
            logger.error(f"[FeiShu] upload video exception: {e}")
            return None

    # Local file (file:// URL): upload directly, no temp file needed.
    if video_url.startswith("file://"):
        local_path = video_url[7:]  # Remove "file://" prefix
        if not os.path.exists(local_path):
            logger.error(f"[FeiShu] local video file not found: {local_path}")
            return None
        return _do_upload(local_path, os.path.basename(local_path))

    # HTTP(S) URL: download to a temp file first; always clean it up.
    temp_name = None
    try:
        logger.info(f"[FeiShu] Downloading video from URL: {video_url}")
        response = requests.get(video_url, timeout=(5, 60))
        if response.status_code != 200:
            logger.error(f"[FeiShu] download video failed, status={response.status_code}")
            return None
        file_name = os.path.basename(video_url) or "video.mp4"
        temp_name = str(uuid.uuid4()) + "_" + file_name
        with open(temp_name, "wb") as file:
            file.write(response.content)
        logger.info(f"[FeiShu] Video downloaded, size={len(response.content)} bytes, uploading...")
        return _do_upload(temp_name, file_name)
    except Exception as e:
        logger.error(f"[FeiShu] upload video from URL exception: {e}")
        return None
    finally:
        # Remove the temp file on every path (success, failure, exception);
        # the original only cleaned up inside the except handler.
        if temp_name and os.path.exists(temp_name):
            os.remove(temp_name)
def _upload_file_url(self, file_url, access_token):
    """Upload a file to Feishu and return its file_key (for "file" type messages).

    Supports:
      - file:// URLs for local files (uploaded directly)
      - http(s):// URLs (downloaded to a temp file, uploaded, temp file removed)

    Args:
        file_url: File location, either a ``file://`` path or an HTTP(S) URL.
        access_token: Feishu tenant access token used for authorization.

    Returns:
        The Feishu ``file_key`` string on success, or ``None`` on any failure.
    """
    logger.debug(f"[FeiShu] start process file, file_url={file_url}")
    upload_url = "https://open.feishu.cn/open-apis/im/v1/files"
    # Feishu natively supports: opus, mp4, pdf, doc, xls, ppt;
    # any other extension is uploaded as a generic "stream".
    file_type_map = {
        '.opus': 'opus', '.mp4': 'mp4', '.pdf': 'pdf',
        '.doc': 'doc', '.docx': 'doc',
        '.xls': 'xls', '.xlsx': 'xls',
        '.ppt': 'ppt', '.pptx': 'ppt',
    }

    def _do_upload(local_path, file_name):
        """Upload one local file to Feishu; return file_key or None."""
        file_ext = os.path.splitext(file_name)[1].lower()
        file_type = file_type_map.get(file_ext, 'stream')
        data = {'file_type': file_type, 'file_name': file_name}
        headers = {'Authorization': f'Bearer {access_token}'}
        try:
            with open(local_path, "rb") as file:
                upload_response = requests.post(
                    upload_url,
                    files={"file": file},
                    data=data,
                    headers=headers,
                    timeout=(5, 30)  # 5s connect, 30s read timeout
                    # (the original HTTP-URL branch had no timeout here at all)
                )
            logger.info(f"[FeiShu] upload file response, status={upload_response.status_code}, res={upload_response.content}")
            response_data = upload_response.json()
            if response_data.get("code") == 0:
                return response_data.get("data").get("file_key")
            logger.error(f"[FeiShu] upload file failed: {response_data}")
            return None
        except Exception as e:
            logger.error(f"[FeiShu] upload file exception: {e}")
            return None

    # Local file (file:// URL): upload directly, no temp file needed.
    if file_url.startswith("file://"):
        local_path = file_url[7:]  # Remove "file://" prefix
        logger.info(f"[FeiShu] uploading local file: {local_path}")
        if not os.path.exists(local_path):
            logger.error(f"[FeiShu] local file not found: {local_path}")
            return None
        return _do_upload(local_path, os.path.basename(local_path))

    # HTTP(S) URL: download to a temp file first; always clean it up.
    temp_name = None
    try:
        response = requests.get(file_url, timeout=(5, 30))
        if response.status_code != 200:
            logger.error(f"[FeiShu] download file failed, status={response.status_code}")
            return None
        file_name = os.path.basename(file_url)
        temp_name = str(uuid.uuid4()) + "_" + file_name
        with open(temp_name, "wb") as file:
            file.write(response.content)
        return _do_upload(temp_name, file_name)
    except Exception as e:
        logger.error(f"[FeiShu] upload file from URL exception: {e}")
        return None
    finally:
        # Remove the temp file on every path; the original leaked it
        # whenever an exception fired in the download/upload sequence.
        if temp_name and os.path.exists(temp_name):
            os.remove(temp_name)
def _compose_context(self, ctype: ContextType, content, **kwargs):
context = Context(ctype, content)
context.kwargs = kwargs
@@ -291,13 +545,18 @@ class FeiShuChanel(ChatChannel):
cmsg = context["msg"]
# Set session_id based on chat type to ensure proper session isolation
# Set session_id based on chat type
if cmsg.is_group:
# Group chat: combine user_id and group_id to create unique session per user per group
# This ensures:
# - Same user in different groups have separate conversation histories
# - Same user in private chat and group chat have separate histories
context["session_id"] = f"{cmsg.from_user_id}:{cmsg.other_user_id}"
# Group chat: check if group_shared_session is enabled
if conf().get("group_shared_session", True):
# All users in the group share the same session context
context["session_id"] = cmsg.other_user_id # group_id
else:
# Each user has their own session within the group
# This ensures:
# - Same user in different groups have separate conversation histories
# - Same user in private chat and group chat have separate histories
context["session_id"] = f"{cmsg.from_user_id}:{cmsg.other_user_id}"
else:
# Private chat: use user_id only
context["session_id"] = cmsg.from_user_id

View File

@@ -1,10 +1,12 @@
from bridge.context import ContextType
from channel.chat_message import ChatMessage
import json
import os
import requests
from common.log import logger
from common.tmp_dir import TmpDir
from common import utils
from config import conf
class FeishuMessage(ChatMessage):
@@ -22,6 +24,99 @@ class FeishuMessage(ChatMessage):
self.ctype = ContextType.TEXT
content = json.loads(msg.get('content'))
self.content = content.get("text").strip()
elif msg_type == "image":
# 单张图片消息,不处理和存储
self.ctype = ContextType.IMAGE
content = json.loads(msg.get("content"))
image_key = content.get("image_key")
# 仅记录图片key不下载
self.content = f"[图片: {image_key}]"
logger.info(f"[FeiShu] Received single image message, key={image_key}, skipped download")
elif msg_type == "post":
# 富文本消息,可能包含图片、文本等多种元素
content = json.loads(msg.get("content"))
# 飞书富文本消息结构content 直接包含 title 和 content 数组
# 不是嵌套在 post 字段下
title = content.get("title", "")
content_list = content.get("content", [])
logger.info(f"[FeiShu] Post message - title: '{title}', content_list length: {len(content_list)}")
# 收集所有图片和文本
image_keys = []
text_parts = []
if title:
text_parts.append(title)
for block in content_list:
logger.debug(f"[FeiShu] Processing block: {block}")
# block 本身就是元素列表
if not isinstance(block, list):
continue
for element in block:
element_tag = element.get("tag")
logger.debug(f"[FeiShu] Element tag: {element_tag}, element: {element}")
if element_tag == "img":
# 找到图片元素
image_key = element.get("image_key")
if image_key:
image_keys.append(image_key)
elif element_tag == "text":
# 文本元素
text_content = element.get("text", "")
if text_content:
text_parts.append(text_content)
logger.info(f"[FeiShu] Parsed - images: {len(image_keys)}, text_parts: {text_parts}")
# 富文本消息统一作为文本消息处理
self.ctype = ContextType.TEXT
if image_keys:
# 如果包含图片,下载并在文本中引用本地路径
workspace_root = os.path.expanduser(conf().get("agent_workspace", "~/cow"))
tmp_dir = os.path.join(workspace_root, "tmp")
os.makedirs(tmp_dir, exist_ok=True)
# 保存图片路径映射
self.image_paths = {}
for image_key in image_keys:
image_path = os.path.join(tmp_dir, f"{image_key}.png")
self.image_paths[image_key] = image_path
def _download_images():
for image_key, image_path in self.image_paths.items():
url = f"https://open.feishu.cn/open-apis/im/v1/messages/{self.msg_id}/resources/{image_key}"
headers = {"Authorization": "Bearer " + access_token}
params = {"type": "image"}
response = requests.get(url=url, headers=headers, params=params)
if response.status_code == 200:
with open(image_path, "wb") as f:
f.write(response.content)
logger.info(f"[FeiShu] Image downloaded from post message, key={image_key}, path={image_path}")
else:
logger.error(f"[FeiShu] Failed to download image from post, key={image_key}, status={response.status_code}")
# 立即下载图片,不使用延迟下载
# 因为 TEXT 类型消息不会调用 prepare()
_download_images()
# 构建消息内容:文本 + 图片路径
content_parts = []
if text_parts:
content_parts.append("\n".join(text_parts).strip())
for image_key, image_path in self.image_paths.items():
content_parts.append(f"[图片: {image_path}]")
self.content = "\n".join(content_parts)
logger.info(f"[FeiShu] Received post message with {len(image_keys)} image(s) and text: {self.content}")
else:
# 纯文本富文本消息
self.content = "\n".join(text_parts).strip() if text_parts else "[富文本消息]"
logger.info(f"[FeiShu] Received post message (text only): {self.content}")
elif msg_type == "file":
self.ctype = ContextType.FILE
content = json.loads(msg.get("content"))

View File

@@ -20,9 +20,7 @@
"Agent测试群",
"ChatGPT测试群2"
],
"image_create_prefix": [
"画"
],
"image_create_prefix": [""],
"speech_recognition": true,
"group_speech_recognition": false,
"voice_reply_voice": false,

View File

@@ -35,6 +35,7 @@ available_setting = {
"group_name_white_list": ["ChatGPT测试群", "ChatGPT测试群2"], # 开启自动回复的群名称列表
"group_name_keyword_white_list": [], # 开启自动回复的群名称关键词列表
"group_chat_in_one_session": ["ChatGPT测试群"], # 支持会话上下文共享的群名称
"group_shared_session": True, # 群聊是否共享会话上下文所有成员共享默认为True。False时每个用户在群内有独立会话
"nick_name_black_list": [], # 用户昵称黑名单
"group_welcome_msg": "", # 配置新人进群固定欢迎语,不配置则使用随机风格欢迎
"trigger_by_self": False, # 是否允许机器人触发

View File

@@ -365,6 +365,7 @@ class ClaudeAPIBot(Bot, OpenAIImage):
# Track tool use state
tool_uses_map = {} # {index: {id, name, input}}
current_tool_use_index = -1
stop_reason = None # Track stop reason from Claude
try:
# Make streaming HTTP request
@@ -440,6 +441,12 @@ class ClaudeAPIBot(Bot, OpenAIImage):
tool_uses_map[current_tool_use_index]["input"] += delta.get("partial_json", "")
elif event_type == "message_delta":
# Extract stop_reason from delta
delta = event.get("delta", {})
if "stop_reason" in delta:
stop_reason = delta.get("stop_reason")
logger.info(f"[Claude] Stream stop_reason: {stop_reason}")
# Message complete - yield tool calls if any
if tool_uses_map:
for idx in sorted(tool_uses_map.keys()):
@@ -462,9 +469,13 @@ class ClaudeAPIBot(Bot, OpenAIImage):
}
}]
},
"finish_reason": None
"finish_reason": stop_reason
}]
}
elif event_type == "message_stop":
# Final event - log completion
logger.debug(f"[Claude] Stream completed with stop_reason: {stop_reason}")
except json.JSONDecodeError:
continue

View File

@@ -0,0 +1,297 @@
# LinkAI Agent Skill
这个 skill 允许你调用 LinkAI 平台上的多个应用(App)和工作流(Workflow),通过简单的配置即可集成多个智能体能力。
## 特性
- **多应用支持** - 在一个配置文件中管理多个 LinkAI 应用/工作流
- **动态加载** - skill 系统加载时自动从 `config.json` 读取应用列表
- **自动技能描述** - 所有配置的应用会自动添加到技能描述中
- **模型切换** - 可以为每个请求指定不同的模型
- **知识库集成** - 支持应用绑定的知识库
- **插件能力** - 支持应用启用的各类插件
- **工作流执行** - 支持执行复杂的多步骤工作流
## 快速开始
### 1. 配置 API Key
```bash
env_config(action="set", key="LINKAI_API_KEY", value="your-linkai-api-key")
```
获取 API Key: https://link-ai.tech/console/interface
### 2. 配置应用列表
将 `config.json.template` 复制为 `config.json`
```bash
cp config.json.template config.json
```
编辑 `config.json`,添加你的应用/工作流:
```json
{
"apps": [
{
"app_code": "G7z6vKwp",
"app_name": "通用助手",
"app_description": "通用AI助手可以回答各类问题"
},
{
"app_code": "your_kb_app",
"app_name": "产品文档助手",
"app_description": "基于产品文档知识库的问答助手"
},
{
"app_code": "your_workflow",
"app_name": "数据分析工作流",
"app_description": "执行数据清洗、分析和可视化的完整工作流"
}
]
}
```
**注意:** 修改 `config.json`Agent 在下次加载技能时会自动读取新配置。
### 3. 调用应用
```bash
bash scripts/call.sh "G7z6vKwp" "What is artificial intelligence?"
```
## 使用示例
### 基础调用
```bash
# 调用默认模型
bash scripts/call.sh "G7z6vKwp" "解释一下量子计算"
```
### 指定模型
```bash
# 使用 GPT-4.1 模型
bash scripts/call.sh "G7z6vKwp" "写一篇关于AI的文章" "LinkAI-4.1"
# 使用 DeepSeek 模型
bash scripts/call.sh "G7z6vKwp" "帮我写代码" "deepseek-chat"
# 使用 Claude 模型
bash scripts/call.sh "G7z6vKwp" "分析这段文本" "claude-4-sonnet"
```
### 调用工作流
```bash
# 工作流会按照配置的节点顺序执行
bash scripts/call.sh "workflow_code" "输入数据或问题"
```
## ⚠️ 重要提示
### 超时配置
LinkAI 应用(特别是视频/图片生成、复杂工作流)可能需要较长时间处理。
**脚本内置超时**
- 默认120 秒(适合大多数场景)
- 可通过第 5 个参数自定义:`bash scripts/call.sh <app_code> <question> "" "false" "180"`
**推荐超时时间**
- **文本问答**120 秒(默认)
- **图片生成**120-180 秒
- **视频生成**180-300 秒
Agent 调用时会自动设置合适的超时时间。
## 配置说明
### config.json 字段
| 字段 | 类型 | 说明 |
|------|------|------|
| `app_code` | string | 应用或工作流的唯一标识码,从 LinkAI 控制台获取 |
| `app_name` | string | 应用名称,会显示在技能描述中 |
| `app_description` | string | 应用功能描述,帮助 Agent 理解何时使用该应用 |
### 获取 app_code
1. 登录 [LinkAI 控制台](https://link-ai.tech/console)
2. 进入「应用管理」或「工作流管理」
3. 选择要集成的应用/工作流
4. 在应用详情页找到 `app_code`
## 支持的模型
LinkAI 支持多种主流 AI 模型:
**OpenAI 系列:**
- `LinkAI-4.1` - GPT-4.1 (1000K 上下文)
- `LinkAI-4.1-mini` - GPT-4.1 mini (1000K)
- `LinkAI-4.1-nano` - GPT-4.1 nano (1000K)
- `LinkAI-4o` - GPT-4o (128K)
- `LinkAI-4o-mini` - GPT-4o mini (128K)
**DeepSeek 系列:**
- `deepseek-chat` - DeepSeek-V3 对话模型 (64K)
- `deepseek-reasoner` - DeepSeek-R1 推理模型 (64K)
**Claude 系列:**
- `claude-4-sonnet` - Claude 4 Sonnet (200K)
- `claude-3-7-sonnet` - Claude 3.7 (200K)
- `claude-3-5-sonnet` - Claude 3.5 (200K)
**Google 系列:**
- `gemini-2.5-pro` - Gemini 2.5 Pro (1000K)
- `gemini-2.0-flash` - Gemini 2.0 Flash (1000K)
**国产模型:**
- `qwen3` - 通义千问3 (128K)
- `wenxin-4.5` - 文心一言4.5 (8K)
- `doubao-1.5-pro-256k` - 豆包1.5 (256K)
- `glm-4-plus` - 智谱GLM-4-Plus (4K)
完整模型列表https://link-ai.tech/console/models
## 应用类型
### 1. 普通应用
配置了系统提示词和参数的标准对话应用,可以:
- 设置角色和性格
- 绑定知识库
- 启用插件(图像识别、网页搜索、代码执行等)
### 2. 知识库应用
基于特定知识库的问答应用,适合:
- 企业内部知识库
- 产品文档问答
- 客户支持
### 3. 工作流
多步骤的自动化流程,可以:
- 串联多个处理节点
- 条件分支
- 循环处理
- 调用外部 API
## 响应格式
### 成功响应
```json
{
"app_code": "G7z6vKwp",
"content": "人工智能AI是计算机科学的一个分支...",
"usage": {
"prompt_tokens": 10,
"completion_tokens": 150,
"total_tokens": 160
}
}
```
### 错误响应
```json
{
"error": "LinkAI API error",
"message": "应用不存在",
"response": { ... }
}
```
## 常见错误
### LINKAI_API_KEY environment variable is not set
**原因:** 未配置 API Key
**解决:** 使用 `env_config` 工具设置 LINKAI_API_KEY
### 应用不存在 (402)
**原因:** app_code 不正确或应用已删除
**解决:** 检查 app_code 是否正确,确认应用存在
### 无访问权限 (403)
**原因:** 尝试访问他人的私有应用
**解决:** 确保应用是公开的或你是创建者
### 账号积分额度不足 (406)
**原因:** LinkAI 账户余额不足
**解决:** 前往控制台充值
### 内容审核不通过 (409)
**原因:** 请求或响应包含敏感内容
**解决:** 修改输入内容,避免敏感词
## 技术实现
### 自动技能描述生成
当 skill 系统加载 `linkai-agent` 时,会自动:
1. 读取 `config.json` 中的应用列表
2. 将每个应用的 name 和 description 动态添加到技能描述中
3. Agent 加载时会看到完整的应用列表
这是在 `agent/skills/loader.py` 中实现的特殊处理。
### 工作流程
```
用户配置 config.json
Agent 启动/重新加载技能
SkillLoader 检测到 linkai-agent
动态读取 config.json
生成包含所有应用描述的 description
Agent 看到所有可用应用的完整信息
用户请求触发
Agent 根据描述选择合适的应用
调用 call.sh <app_code> <question>
LinkAI API 处理并返回结果
```
## 最佳实践
1. **清晰的描述** - 为每个应用写清晰、具体的描述,帮助 Agent 理解应用用途
2. **合理分工** - 不同应用负责不同领域,避免功能重叠
3. **无需重启** - 修改 config.json 后Agent 下次加载技能时会自动更新
4. **模型选择** - 根据任务复杂度选择合适的模型
5. **知识库优化** - 为专业领域的应用绑定相关知识库
## 扩展用法
### 在 Agent 系统中使用
当 Agent 系统加载这个 skill 时,会自动从 `config.json` 读取应用列表并生成描述:
```
Call LinkAI apps/workflows. 通用助手(G7z6vKwp: 通用AI助手可以回答各类问题); 产品文档助手(kb_app_001: 基于产品文档知识库的问答助手); 数据分析工作流(wf_002: 执行数据清洗、分析和可视化的完整工作流)
```
Agent 会根据用户问题自动选择最合适的应用进行调用。
## 相关链接
- LinkAI 平台: https://link-ai.tech
- API 文档: https://docs.link-ai.tech
- 控制台: https://link-ai.tech/console
- 模型列表: https://link-ai.tech/console/models
- 应用广场: https://link-ai.tech/square
## License
Part of the chatgpt-on-wechat project.

View File

@@ -0,0 +1,165 @@
---
name: linkai-agent
description: Call LinkAI applications and workflows. Use bash command to execute like 'bash <base_dir>/scripts/call.sh <app_code> <question>'.
homepage: https://link-ai.tech
metadata:
emoji: 🤖
requires:
bins: ["curl"]
env: ["LINKAI_API_KEY"]
primaryEnv: "LINKAI_API_KEY"
---
# LinkAI Agent Caller
Call LinkAI applications and workflows through API. Supports multiple apps/workflows configured in config.json.
The available apps are dynamically loaded from `config.json` at skill loading time.
## Setup
This skill requires a LinkAI API key. If not configured:
1. Get your API key from https://link-ai.tech/console/api-keys
2. Set the key using: `env_config(action="set", key="LINKAI_API_KEY", value="your-key")`
## Configuration
1. Copy `config.json.template` to `config.json`
2. Configure your apps/workflows:
```json
{
"apps": [
{
"app_code": "your_app_code",
"app_name": "App Name",
"app_description": "What this app does"
}
]
}
```
3. The skill description will be automatically updated when the agent loads this skill
## Usage
**Important**: Scripts are located relative to this skill's base directory.
When you see this skill in `<available_skills>`, note the `<base_dir>` path.
**CRITICAL**: Always use `bash` command to execute the script:
```bash
# General pattern (MUST start with bash):
bash "<base_dir>/scripts/call.sh" "<app_code>" "<question>" [model] [stream] [timeout]
# DO NOT execute the script directly like this (WRONG):
# "<base_dir>/scripts/call.sh" ...
# Parameters:
# - app_code: LinkAI app or workflow code (required)
# - question: User question (required)
# - model: Override model (optional, uses app default if not specified)
# - stream: Enable streaming (true/false, default: false)
# - timeout: curl timeout in seconds (default: 120, recommended for video/image generation)
```
**IMPORTANT - Timeout Configuration**:
- The script has a **default timeout of 120 seconds** (suitable for most cases)
- For complex tasks (video generation, large workflows), pass a longer timeout as the 5th parameter
- The bash tool also needs sufficient timeout - set its `timeout` parameter accordingly
- Example: `bash(command="bash <script> <app_code> <question> '' 'false' 180", timeout=200)`
## Examples
### Call an app (uses default 120s timeout)
```bash
bash(command='bash "<base_dir>/scripts/call.sh" "G7z6vKwp" "What is AI?"', timeout=60)
```
### Call an app with specific model
```bash
bash(command='bash "<base_dir>/scripts/call.sh" "G7z6vKwp" "Explain machine learning" "LinkAI-4.1"', timeout=60)
```
### Call a workflow with custom timeout (video generation)
```bash
# Pass timeout as 5th parameter to script, and set bash timeout slightly longer
bash(command='bash "<base_dir>/scripts/call.sh" "workflow_code" "Generate a sunset video" "" "false" "180"', timeout=180)
```
```bash
bash "<base_dir>/scripts/call.sh" "workflow_code" "Analyze this data: ..."
```
## Supported Models
You can specify any LinkAI supported model:
- `LinkAI-4.1` - Latest GPT-4.1 model (1000K context)
- `LinkAI-4.1-mini` - GPT-4.1 mini (1000K context)
- `LinkAI-4o` - GPT-4o model (128K context)
- `LinkAI-4o-mini` - GPT-4o mini (128K context)
- `deepseek-chat` - DeepSeek-V3 (64K context)
- `deepseek-reasoner` - DeepSeek-R1 reasoning model
- `claude-4-sonnet` - Claude 4 Sonnet (200K context)
- `gemini-2.5-pro` - Gemini 2.5 Pro (1000K context)
- And many more...
Full model list: https://link-ai.tech/console/models
## Response Format
Success response:
```json
{
"app_code": "G7z6vKwp",
"content": "AI stands for Artificial Intelligence...",
"usage": {
"prompt_tokens": 10,
"completion_tokens": 50,
"total_tokens": 60
}
}
```
Error response:
```json
{
"error": "Error description",
"message": "Detailed error message"
}
```
## Features
- **Multiple Apps**: Configure and call multiple LinkAI apps/workflows
- **Dynamic Loading**: Apps are loaded from config.json at runtime
- **Model Override**: Optionally specify model per request
- **Streaming Support**: Enable streaming output
- **Knowledge Base**: Apps can use configured knowledge bases
- **Plugins**: Apps can use enabled plugins (image recognition, web search, etc.)
- **Workflows**: Execute complex multi-step workflows
## Notes
- Each app/workflow maintains its own configuration (prompt, model, temperature, etc.)
- Apps can have knowledge bases attached for domain-specific Q&A
- Workflows execute from start node to end node and return final output
- Token usage and costs depend on the model used
- See LinkAI documentation for pricing: https://link-ai.tech/console/funds
- The skill description is automatically generated from config.json when loaded
## Troubleshooting
**"LINKAI_API_KEY environment variable is not set"**
- Use env_config tool to set the API key
**"app_code is required"**
- Make sure you're passing the app_code as the first parameter
**"应用不存在" (App not found)**
- Check that the app_code is correct
- Ensure you have access to the app
**"账号积分额度不足" (Insufficient credits)**
- Top up your LinkAI account credits

View File

@@ -0,0 +1,14 @@
{
"apps": [
{
"app_code": "your_app_code_2",
"app_name": "知识库助手",
"app_description": "基于特定领域知识库提供智能问答的知识助手"
},
{
"app_code": "your_workflow_code",
"app_name": "数据分析工作流",
"app_description": "用于数据分析任务的工作流程"
}
]
}

View File

@@ -0,0 +1,138 @@
#!/usr/bin/env bash
# LinkAI Agent Caller
# Calls the LinkAI chat-completions API for a configured app/workflow.
# API Docs: https://api.link-ai.tech/v1/chat/completions
#
# Usage: bash call.sh <app_code> <question> [model] [stream] [timeout]

set -euo pipefail

app_code="${1:-}"
question="${2:-}"
model="${3:-}"
stream="${4:-false}"
timeout="${5:-120}"  # Default 120 seconds for video/image generation

if [ -z "$app_code" ]; then
  echo '{"error": "app_code is required", "usage": "bash call.sh <app_code> <question> [model] [stream] [timeout]"}'
  exit 1
fi

if [ -z "$question" ]; then
  echo '{"error": "question is required", "usage": "bash call.sh <app_code> <question> [model] [stream] [timeout]"}'
  exit 1
fi

if [ -z "${LINKAI_API_KEY:-}" ]; then
  echo '{"error": "LINKAI_API_KEY environment variable is not set", "help": "Use env_config to set LINKAI_API_KEY"}'
  exit 1
fi

# stream is interpolated into the JSON body unquoted; only accept literal booleans.
if [ "$stream" != "true" ] && [ "$stream" != "false" ]; then
  stream="false"
fi

# Escape a string for safe embedding inside a JSON string literal.
# Without this, any quote/backslash/newline in the question broke the request body.
json_escape() {
  local s="$1"
  s=${s//\\/\\\\}     # backslashes first
  s=${s//\"/\\\"}     # double quotes
  s=${s//$'\n'/\\n}   # newlines
  s=${s//$'\r'/\\r}   # carriage returns
  s=${s//$'\t'/\\t}   # tabs
  printf '%s' "$s"
}

# API endpoint
api_url="https://api.link-ai.tech/v1/chat/completions"

# Build JSON request body (model line is optional).
model_field=""
if [ -n "$model" ]; then
  model_field="\"model\": \"$(json_escape "$model")\","
fi

request_body=$(cat <<EOF
{
  "app_code": "$(json_escape "$app_code")",
  $model_field
  "messages": [
    {
      "role": "user",
      "content": "$(json_escape "$question")"
    }
  ],
  "stream": $stream
}
EOF
)

# Call LinkAI API.
# NOTE: capture the exit status with `|| ...` so `set -e` does not abort the
# script before we can emit a friendly JSON error (the old `$?` check was dead code).
curl_exit_code=0
response=$(curl -sS --max-time "$timeout" \
  -X POST \
  -H "Authorization: Bearer $LINKAI_API_KEY" \
  -H "Content-Type: application/json" \
  -d "$request_body" \
  "$api_url" 2>&1) || curl_exit_code=$?

if [ $curl_exit_code -ne 0 ]; then
  echo "{\"error\": \"Failed to call LinkAI API\", \"details\": \"$(json_escape "$response")\"}"
  exit 1
fi

# Simple JSON validation
if [[ ! "$response" =~ ^[[:space:]]*[\{\[] ]]; then
  echo "{\"error\": \"Invalid JSON response from API\", \"response\": \"$(json_escape "$response")\"}"
  exit 1
fi

# Check for API error (top-level error only, not content_filter_result)
if echo "$response" | grep -q '^[[:space:]]*{[[:space:]]*"error"[[:space:]]*:' || echo "$response" | grep -q '"error"[[:space:]]*:[[:space:]]*{[^}]*"code"[[:space:]]*:[[:space:]]*"[^"]*"[^}]*"message"'; then
  # Make sure it's not just content_filter_result inside choices
  if ! echo "$response" | grep -q '"choices"[[:space:]]*:[[:space:]]*\['; then
    # Extract error message; `|| true` keeps `set -e`+pipefail from killing the
    # script when grep finds no match (previously that abort skipped all fallbacks).
    error_msg=$(echo "$response" | grep -o '"message"[[:space:]]*:[[:space:]]*"[^"]*"' | sed 's/"message"[[:space:]]*:[[:space:]]*"\(.*\)"/\1/' | head -1 || true)
    error_code=$(echo "$response" | grep -o '"code"[[:space:]]*:[[:space:]]*"[^"]*"' | sed 's/"code"[[:space:]]*:[[:space:]]*"\(.*\)"/\1/' | head -1 || true)
    if [ -z "$error_msg" ]; then
      error_msg="Unknown API error"
    fi
    # Provide friendly error message for content filter
    if [ "$error_code" = "content_filter_error" ] || echo "$error_msg" | grep -qi "content.*filter"; then
      echo "{\"error\": \"内容安全审核\", \"message\": \"您的问题或应用返回的内容触发了LinkAI的安全审核机制请换一种方式提问或检查应用配置\", \"details\": \"$error_msg\"}"
    else
      echo "{\"error\": \"LinkAI API error\", \"message\": \"$error_msg\", \"code\": \"$error_code\"}"
    fi
    exit 1
  fi
fi

# For non-stream mode, extract and format the response
if [ "$stream" = "false" ]; then
  # Extract content from response (guarded so a miss falls through to the raw echo)
  content=$(echo "$response" | grep -o '"content"[[:space:]]*:[[:space:]]*"[^"]*"' | sed 's/"content"[[:space:]]*:[[:space:]]*"\(.*\)"/\1/' | head -1 || true)
  # Extract usage information
  prompt_tokens=$(echo "$response" | grep -o '"prompt_tokens"[[:space:]]*:[[:space:]]*[0-9]*' | grep -o '[0-9]*' | head -1 || true)
  completion_tokens=$(echo "$response" | grep -o '"completion_tokens"[[:space:]]*:[[:space:]]*[0-9]*' | grep -o '[0-9]*' | head -1 || true)
  total_tokens=$(echo "$response" | grep -o '"total_tokens"[[:space:]]*:[[:space:]]*[0-9]*' | grep -o '[0-9]*' | head -1 || true)
  if [ -n "$content" ]; then
    # Unescape JSON content
    content=$(echo "$content" | sed 's/\\n/\n/g' | sed 's/\\"/"/g')
    cat <<EOF
{
  "app_code": "$app_code",
  "content": "$content",
  "usage": {
    "prompt_tokens": ${prompt_tokens:-0},
    "completion_tokens": ${completion_tokens:-0},
    "total_tokens": ${total_tokens:-0}
  }
}
EOF
  else
    # Return full response if we can't extract content
    echo "$response"
  fi
else
  # For stream mode, return raw response (caller needs to handle streaming)
  echo "$response"
fi

View File

@@ -0,0 +1,168 @@
# OpenAI Image Vision - Usage Examples
## Setup
Set up your API credentials using the agent's env_config tool:
```bash
# Set your OpenAI API key
env_config(action="set", key="OPENAI_API_KEY", value="sk-your-api-key-here")
# Optional: Set custom API base URL (for proxy or compatible services)
env_config(action="set", key="OPENAI_API_BASE", value="https://api.openai.com/v1")
```
## Example 1: Analyze a Local Image
```bash
bash scripts/vision.sh "/path/to/photo.jpg" "What's in this image?"
```
**Expected Output:**
```json
{
"model": "gpt-4.1-mini",
"content": "The image shows a beautiful landscape with mountains in the background and a lake in the foreground. The sky is clear with some clouds, and there are trees along the shoreline.",
"usage": {
"prompt_tokens": 1234,
"completion_tokens": 45,
"total_tokens": 1279
}
}
```
## Example 2: Analyze an Image from URL
```bash
bash scripts/vision.sh "https://example.com/image.jpg" "Describe this image in detail"
```
## Example 3: Extract Text (OCR)
```bash
bash scripts/vision.sh "document.png" "Extract all text from this image"
```
**Use Case:** Extract text from screenshots, scanned documents, or photos of text.
## Example 4: Identify Objects
```bash
bash scripts/vision.sh "scene.jpg" "List all objects you can identify in this image"
```
## Example 5: Analyze Colors and Composition
```bash
bash scripts/vision.sh "artwork.jpg" "Describe the color palette and composition of this image"
```
## Example 6: Count Items
```bash
bash scripts/vision.sh "crowd.jpg" "How many people are in this image?"
```
## Example 7: Use Different Models
```bash
# Use gpt-4.1-mini (default, latest mini model)
bash scripts/vision.sh "image.jpg" "Analyze this" "gpt-4.1-mini"
# Use gpt-4.1 (most capable, best for complex analysis)
bash scripts/vision.sh "image.jpg" "Analyze this" "gpt-4.1"
# Use gpt-4o-mini (previous mini model)
bash scripts/vision.sh "image.jpg" "Analyze this" "gpt-4o-mini"
```
## Example 8: Complex Analysis
```bash
bash scripts/vision.sh "product.jpg" "Analyze this product image. Describe the product, its features, colors, and suggest what kind of marketing copy would work well for it."
```
## Example 9: Safety and Content Moderation
```bash
bash scripts/vision.sh "content.jpg" "Is there any inappropriate or unsafe content in this image?"
```
## Example 10: Technical Analysis
```bash
bash scripts/vision.sh "diagram.png" "Explain what this technical diagram represents and how it works"
```
## Integration with Agent
When the agent loads this skill, it will be available in the `<available_skills>` section. The agent can use it like:
```bash
bash "<base_dir>/scripts/vision.sh" "user_uploaded_image.jpg" "What's in this image?"
```
The `<base_dir>` will be automatically provided by the skill system.
## Error Handling Examples
### Missing API Key
```bash
$ bash scripts/vision.sh "image.jpg" "What is this?"
{"error": "OPENAI_API_KEY environment variable is not set", "help": "Visit https://platform.openai.com/api-keys to get an API key"}
```
### File Not Found
```bash
$ bash scripts/vision.sh "nonexistent.jpg" "What is this?"
{"error": "Image file not found", "path": "nonexistent.jpg"}
```
### Unsupported Format
```bash
$ bash scripts/vision.sh "file.bmp" "What is this?"
{"error": "Unsupported image format", "extension": "bmp", "supported": ["jpg", "jpeg", "png", "gif", "webp"]}
```
### Missing Parameters
```bash
$ bash scripts/vision.sh
{"error": "Image path or URL is required", "usage": "bash vision.sh <image_path_or_url> <question> [model]"}
```
## Tips for Best Results
1. **Be Specific**: Ask clear, specific questions about what you want to know
2. **Image Quality**: Higher quality images generally produce better results
3. **Model Selection**:
- Use `gpt-4.1` for complex analysis requiring highest accuracy
- Use `gpt-4.1-mini` (default) for most tasks - latest mini model with good balance
4. **Text Extraction**: For OCR tasks, ensure text is clearly visible and not too small
5. **Multiple Aspects**: You can ask about multiple things in one question
6. **Context**: Provide context in your question if needed (e.g., "This is a medical scan, what do you see?")
## Performance Notes
- **Local Files**: Automatically base64-encoded, adds ~33% size overhead
- **URLs**: Passed directly to API, no encoding overhead
- **Timeout**: 60 seconds for API calls
- **Max Tokens**: 1000 tokens for responses (configurable in script)
- **Rate Limits**: Subject to your OpenAI API plan
## Supported Image Formats
✅ JPEG (`.jpg`, `.jpeg`)
✅ PNG (`.png`)
✅ GIF (`.gif`)
✅ WebP (`.webp`)
❌ BMP, TIFF, SVG, and other formats are not supported
## Cost Considerations
Vision API calls cost more than text-only calls because they include image tokens. Costs vary by:
- Model used (gpt-4.1 vs gpt-4.1-mini)
- Image size and resolution
- Length of response
Check OpenAI's pricing page for current rates: https://openai.com/pricing

View File

@@ -0,0 +1,178 @@
# OpenAI Image Vision Skill
This skill enables image analysis using OpenAI's Vision API (GPT-4 Vision models).
## Features
- ✅ Analyze images from local files or URLs
- ✅ Support for multiple image formats (JPEG, PNG, GIF, WebP)
- ✅ Automatic base64 encoding for local files
- ✅ Direct URL passing for remote images
- ✅ Configurable model selection
- ✅ Custom API base URL support
- ✅ Pure bash/curl implementation (no Python dependencies)
## Quick Start
1. **Set up API credentials using env_config:**
```bash
env_config(action="set", key="OPENAI_API_KEY", value="sk-your-api-key-here")
# Optional: custom API base
env_config(action="set", key="OPENAI_API_BASE", value="https://api.openai.com/v1")
```
2. **Analyze an image:**
```bash
bash scripts/vision.sh "/path/to/photo.jpg" "What's in this image?"
```
3. **Analyze from URL:**
```bash
bash scripts/vision.sh "https://example.com/image.jpg" "Describe this image"
```
## Usage Examples
### Basic image analysis
```bash
bash scripts/vision.sh "photo.jpg" "What objects can you see?"
```
### Text extraction (OCR)
```bash
bash scripts/vision.sh "document.png" "Extract all text from this image"
```
### Detailed description
```bash
bash scripts/vision.sh "scene.jpg" "Describe this scene in detail, including colors, mood, and composition"
```
### Using different models
```bash
# Use gpt-4.1-mini (default, latest mini model)
bash scripts/vision.sh "image.jpg" "Analyze this" "gpt-4.1-mini"
# Use gpt-4.1 (most capable, latest model)
bash scripts/vision.sh "image.jpg" "Analyze this" "gpt-4.1"
# Use gpt-4o-mini (previous mini model)
bash scripts/vision.sh "image.jpg" "Analyze this" "gpt-4o-mini"
```
## Environment Variables
| Variable | Required | Default | Description |
|----------|----------|---------|-------------|
| `OPENAI_API_KEY` | Yes | - | Your OpenAI API key |
| `OPENAI_API_BASE` | No | `https://api.openai.com/v1` | Custom API base URL |
## Response Format
Success response:
```json
{
"model": "gpt-4.1-mini",
"content": "The image shows a beautiful sunset over mountains...",
"usage": {
"prompt_tokens": 1234,
"completion_tokens": 567,
"total_tokens": 1801
}
}
```
Error response:
```json
{
"error": "Error description",
"details": "Additional information"
}
```
## Supported Models
- `gpt-4.1-mini` (default) - Latest mini model, fast and cost-effective
- `gpt-4.1` - Latest GPT-4 variant, most capable
- `gpt-4o-mini` - Previous generation mini model
- `gpt-4-turbo` - Previous generation turbo model
## Supported Image Formats
- JPEG (`.jpg`, `.jpeg`)
- PNG (`.png`)
- GIF (`.gif`)
- WebP (`.webp`)
## Technical Details
- **Implementation**: Pure bash script using curl and base64
- **Timeout**: 60 seconds for API calls
- **Max tokens**: 1000 tokens for responses
- **Image handling**:
- Local files are automatically base64-encoded
- URLs are passed directly to the API
- MIME types are auto-detected from file extensions
## Error Handling
The script handles various error cases:
- Missing required parameters
- Missing API key
- File not found
- Unsupported image formats
- API errors
- Network timeouts
- Invalid JSON responses
## Integration with Agent System
When loaded by the agent system, this skill will appear in `<available_skills>` with a `<base_dir>` path. Use it like:
```bash
bash "<base_dir>/scripts/vision.sh" "image.jpg" "What's in this image?"
```
The agent will automatically:
- Load environment variables from `~/.cow/.env`
- Provide the correct `<base_dir>` path
- Handle skill discovery and registration
## Notes
- Images are sent to OpenAI's servers for processing
- Large images may be automatically resized by the API
- Rate limits depend on your OpenAI API plan
- Token usage includes both the image and text in the prompt
- Base64 encoding increases the size of local images by ~33%
## Troubleshooting
**"OPENAI_API_KEY environment variable is not set"**
- Set the key with the agent's env_config tool: `env_config(action="set", key="OPENAI_API_KEY", value="sk-...")`
- Or export `OPENAI_API_KEY` in your shell environment before running the script
**"Image file not found"**
- Check the file path is correct
- Use absolute paths or paths relative to current directory
**"Unsupported image format"**
- Only JPEG, PNG, GIF, and WebP are supported
- Check the file extension matches the actual format
**"Failed to call OpenAI API"**
- Check your internet connection
- Verify the API key is valid
- Check if custom API base URL is correct
## License
Part of the chatgpt-on-wechat project.

View File

@@ -0,0 +1,119 @@
---
name: openai-image-vision
description: Analyze images using OpenAI's Vision API. Use bash command to execute the vision script like 'bash <base_dir>/scripts/vision.sh <image> <question>'. Can understand image content, objects, text, colors, and answer questions about images.
homepage: https://platform.openai.com/docs/guides/vision
metadata:
emoji: 👁️
requires:
bins: ["curl", "base64"]
env: ["OPENAI_API_KEY"]
primaryEnv: "OPENAI_API_KEY"
---
# OpenAI Image Vision
Analyze images using OpenAI's GPT-4 Vision API. The model can understand visual elements including objects, shapes, colors, textures, and text within images.
## Setup
This skill requires an OpenAI API key. If not configured:
1. Get your API key from https://platform.openai.com/api-keys
2. Set the key using: `env_config(action="set", key="OPENAI_API_KEY", value="your-key")`
Optional: Set custom API base URL (default: https://api.openai.com/v1):
```bash
env_config(action="set", key="OPENAI_API_BASE", value="your-base-url")
```
## Usage
**Important**: Scripts are located relative to this skill's base directory.
When you see this skill in `<available_skills>`, note the `<base_dir>` path.
**CRITICAL**: Always use `bash` command to execute the script:
```bash
# General pattern (MUST start with bash):
bash "<base_dir>/scripts/vision.sh" "<image_path_or_url>" "<question>" [model]
# DO NOT execute the script directly like this (WRONG):
# "<base_dir>/scripts/vision.sh" ...
# Parameters:
# - image_path_or_url: Local image file path or HTTP(S) URL (required)
# - question: Question to ask about the image (required)
# - model: OpenAI model to use (default: gpt-4.1-mini)
# Options: gpt-4.1-mini, gpt-4.1, gpt-4o-mini, gpt-4-turbo
```
## Examples
### Analyze a local image
```bash
bash "<base_dir>/scripts/vision.sh" "/path/to/image.jpg" "What's in this image?"
```
### Analyze an image from URL
```bash
bash "<base_dir>/scripts/vision.sh" "https://example.com/image.jpg" "Describe this image in detail"
```
### Use specific model
```bash
bash "<base_dir>/scripts/vision.sh" "/path/to/photo.png" "What colors are prominent?" "gpt-4o-mini"
```
### Extract text from image
```bash
bash "<base_dir>/scripts/vision.sh" "/path/to/document.jpg" "Extract all text from this image"
```
### Analyze multiple aspects
```bash
bash "<base_dir>/scripts/vision.sh" "image.jpg" "List all objects you can see and describe the overall scene"
```
## Supported Image Formats
- JPEG (.jpg, .jpeg)
- PNG (.png)
- GIF (.gif)
- WebP (.webp)
**Performance Optimization**: Files larger than 1MB are automatically compressed to 800px (longest side) when `sips` (macOS) or ImageMagick's `convert` is available, to avoid command-line parameter limits; otherwise the original file is sent as-is. This happens transparently without affecting analysis quality.
## Response Format
The script returns a JSON response:
```json
{
"model": "gpt-4.1-mini",
"content": "The image shows...",
"usage": {
"prompt_tokens": 1234,
"completion_tokens": 567,
"total_tokens": 1801
}
}
```
Or in case of error:
```json
{
"error": "Error description",
"details": "Additional error information"
}
```
## Notes
- **Image size**: Images are automatically resized if too large
- **Timeout**: 60 seconds for API calls
- **Rate limits**: Subject to your OpenAI API plan limits
- **Privacy**: Images are sent to OpenAI's servers for processing
- **Local files**: Automatically converted to base64 for API submission
- **URLs**: Can be passed directly to the API without downloading

View File

@@ -0,0 +1,233 @@
#!/usr/bin/env bash
# OpenAI Vision API wrapper
# API Docs: https://platform.openai.com/docs/guides/vision
set -euo pipefail
# --- Argument parsing & environment checks ---------------------------------
image_input="${1:-}"          # local file path or http(s) URL (required)
question="${2:-}"             # question to ask about the image (required)
model="${3:-gpt-4.1-mini}"    # optional model override

usage="bash vision.sh <image_path_or_url> <question> [model]"

if [ -z "$image_input" ]; then
    echo "{\"error\": \"Image path or URL is required\", \"usage\": \"$usage\"}"
    exit 1
fi

if [ -z "$question" ]; then
    echo "{\"error\": \"Question is required\", \"usage\": \"$usage\"}"
    exit 1
fi

if [ -z "${OPENAI_API_KEY:-}" ]; then
    echo '{"error": "OPENAI_API_KEY environment variable is not set", "help": "Visit https://platform.openai.com/api-keys to get an API key"}'
    exit 1
fi

# API base URL: default to OpenAI's official endpoint, strip any trailing
# slash so path concatenation below cannot produce "//chat/completions".
api_base="${OPENAI_API_BASE:-https://api.openai.com/v1}"
api_base="${api_base%/}"
# ---------------------------------------------------------------------------
# Build the chat-completions request body into $request_body.
#
# Fixes over the original:
#   * the question is JSON-escaped in BOTH branches (previously the URL
#     branch embedded it raw, so a double quote in the question broke the
#     request); backslashes are escaped before quotes.
#   * sips/convert failures are tested via `if cmd; then` — under `set -e`
#     a plain `cmd; if [ $? -eq 0 ]` would abort the script before the
#     fallback could run.
#   * base64 output has newlines stripped — GNU base64 wraps at 76 columns,
#     which is illegal inside a JSON string literal.
# NOTE(review): embedded newlines in the question are still unsupported;
# they would need \n encoding.
escaped_question=$(printf '%s' "$question" | sed 's/\\/\\\\/g; s/"/\\"/g')

if [[ "$image_input" =~ ^https?:// ]]; then
    # Remote image: the API fetches the URL itself, no encoding needed.
    image_payload="$image_input"
else
    # Local file: must exist, be a supported format, and be base64-encoded.
    if [ ! -f "$image_input" ]; then
        echo "{\"error\": \"Image file not found\", \"path\": \"$image_input\"}"
        exit 1
    fi

    # The encoded image travels on the curl command line, so very large
    # payloads hit "Argument list too long"; recompress files over 1MB.
    file_size=$(wc -c < "$image_input" | tr -d ' ')
    max_size=1048576  # 1MB
    image_to_encode="$image_input"
    temp_compressed=""
    if [ "$file_size" -gt "$max_size" ]; then
        temp_compressed=$(mktemp "${TMPDIR:-/tmp}/vision_compressed_XXXXXX.jpg")
        if command -v sips &> /dev/null; then
            # macOS: resize to max 800px on the longest side.
            if sips -Z 800 "$image_input" --out "$temp_compressed" &> /dev/null; then
                image_to_encode="$temp_compressed"
                >&2 echo "[vision.sh] Compressed large image ($(($file_size / 1024))KB) to avoid parameter limit"
            fi
        elif command -v convert &> /dev/null; then
            # Linux: ImageMagick; on failure we fall back to the original file.
            if convert "$image_input" -resize 800x800\> "$temp_compressed" 2>/dev/null; then
                image_to_encode="$temp_compressed"
                >&2 echo "[vision.sh] Compressed large image ($(($file_size / 1024))KB) to avoid parameter limit"
            fi
        fi
    fi

    # MIME type from the file extension (case-insensitive).
    extension="${image_to_encode##*.}"
    extension_lower=$(echo "$extension" | tr '[:upper:]' '[:lower:]')
    case "$extension_lower" in
        jpg|jpeg) mime_type="image/jpeg" ;;
        png)      mime_type="image/png" ;;
        gif)      mime_type="image/gif" ;;
        webp)     mime_type="image/webp" ;;
        *)
            echo "{\"error\": \"Unsupported image format\", \"extension\": \"$extension\", \"supported\": [\"jpg\", \"jpeg\", \"png\", \"gif\", \"webp\"]}"
            [ -n "$temp_compressed" ] && rm -f "$temp_compressed"
            exit 1
            ;;
    esac

    if ! command -v base64 &> /dev/null; then
        echo '{"error": "base64 command not found", "help": "Please install base64 utility"}'
        [ -n "$temp_compressed" ] && rm -f "$temp_compressed"
        exit 1
    fi

    # `base64 -i` is the macOS form; plain `base64` is coreutils. Strip any
    # line wrapping; `|| true` keeps `set -e` from killing the script before
    # the empty-output check below can report the failure.
    base64_image=$( { base64 -i "$image_to_encode" 2>/dev/null || base64 "$image_to_encode" 2>/dev/null || true; } | tr -d '\n' )

    # Clean up temp compressed file.
    [ -n "$temp_compressed" ] && rm -f "$temp_compressed"

    if [ -z "$base64_image" ]; then
        echo "{\"error\": \"Failed to encode image to base64\", \"path\": \"$image_input\"}"
        exit 1
    fi

    image_payload="data:$mime_type;base64,$base64_image"
fi

# Single heredoc for both branches (the original duplicated it verbatim).
request_body=$(cat <<EOF
{
  "model": "$model",
  "messages": [
    {
      "role": "user",
      "content": [
        {
          "type": "text",
          "text": "$escaped_question"
        },
        {
          "type": "image_url",
          "image_url": {
            "url": "$image_payload"
          }
        }
      ]
    }
  ],
  "max_tokens": 1000
}
EOF
)
# ---------------------------------------------------------------------------
# Call the OpenAI API.
# The assignment is guarded with `|| curl_exit_code=$?` — under `set -e` a
# failing curl would otherwise abort the script before we could emit the
# structured error JSON (the original's `curl_exit_code=$?` line was
# unreachable on failure). stderr is merged into the capture so the error
# report can include curl's own message.
curl_exit_code=0
response=$(curl -sS --max-time 60 \
    -X POST \
    -H "Authorization: Bearer $OPENAI_API_KEY" \
    -H "Content-Type: application/json" \
    -d "$request_body" \
    "$api_base/chat/completions" 2>&1) || curl_exit_code=$?
if [ "$curl_exit_code" -ne 0 ]; then
    # Escape the captured output so the emitted error report stays valid JSON
    # (curl messages may contain quotes or newlines).
    safe_details=$(printf '%s' "$response" | tr '\n' ' ' | sed 's/\\/\\\\/g; s/"/\\"/g')
    echo "{\"error\": \"Failed to call OpenAI API\", \"details\": \"$safe_details\"}"
    exit 1
fi
# Cheap sanity check: a JSON document must start with '{' or '['.
if [[ ! "$response" =~ ^[[:space:]]*[\{\[] ]]; then
    safe_details=$(printf '%s' "$response" | tr '\n' ' ' | sed 's/\\/\\\\/g; s/"/\\"/g')
    echo "{\"error\": \"Invalid JSON response from API\", \"response\": \"$safe_details\"}"
    exit 1
fi
# Surface API-level errors: on failure the API returns {"error": {...}}.
if echo "$response" | grep -q '"error"[[:space:]]*:[[:space:]]*{'; then
    # `|| true`: with `set -euo pipefail`, an error object without a
    # "message" field would make the pipeline (and so this assignment) fail
    # and kill the script before the fallback below — the original's
    # "Unknown API error" path was unreachable.
    error_msg=$(echo "$response" | grep -o '"message"[[:space:]]*:[[:space:]]*"[^"]*"' | sed 's/"message"[[:space:]]*:[[:space:]]*"\(.*\)"/\1/' | head -1) || true
    if [ -z "$error_msg" ]; then
        error_msg="Unknown API error"
    fi
    # $response was validated above to look like JSON, so it is embedded raw.
    echo "{\"error\": \"OpenAI API error\", \"message\": \"$error_msg\", \"response\": $response}"
    exit 1
fi
# ---------------------------------------------------------------------------
# Extract choices[0].message.content and token usage with grep/sed (no jq
# dependency). Every assignment gets `|| true`: under `set -euo pipefail`
# a non-matching grep fails the pipeline, the failure propagates through the
# command substitution into the assignment, and the script would die instead
# of reaching the ${var:-0} defaults / full-response fallback below.
json_num_field() {
    # Print the first numeric value of field "$1" in $response (empty if absent).
    echo "$response" | grep -o "\"$1\"[[:space:]]*:[[:space:]]*[0-9]*" | grep -o '[0-9]*' | head -1
}
content=$(echo "$response" | grep -o '"content"[[:space:]]*:[[:space:]]*"[^"]*"' | sed 's/"content"[[:space:]]*:[[:space:]]*"\(.*\)"/\1/' | head -1) || true
prompt_tokens=$(json_num_field prompt_tokens) || true
completion_tokens=$(json_num_field completion_tokens) || true
total_tokens=$(json_num_field total_tokens) || true
# Build simplified response
if [ -n "$content" ]; then
# Unescape JSON content (basic unescaping)
content=$(echo "$content" | sed 's/\\n/\n/g' | sed 's/\\"/"/g')
cat <<EOF
{
"model": "$model",
"content": "$content",
"usage": {
"prompt_tokens": ${prompt_tokens:-0},
"completion_tokens": ${completion_tokens:-0},
"total_tokens": ${total_tokens:-0}
}
}
EOF
else
# If we can't extract content, return the full response
echo "$response"
fi