feat: add TTS voice check script and optimize the podcast generator

refactor(podcast_generator): improve audio generation and merging logic
- Add multithreading support to speed up audio generation
- Harden the JSON parsing logic for more robust script extraction
- Merge generated audio files into WAV format
- Add execution-time reporting

docs(config): update voice configuration files and add new voices
- Add a usedname field to every voice
- Add new voice configurations and role definitions
- Update API URL parameters

chore: update .gitignore to exclude log files
Author: hex2077
Date: 2025-08-06 16:14:41 +08:00
Parent: cd528f407c
Commit: c72e64e529
8 changed files with 545 additions and 221 deletions

.gitignore

@@ -1,3 +1,4 @@
# 忽略 Python 缓存目录
__pycache__/
output/
output/
excalidraw.log


@@ -0,0 +1,48 @@
import json
import requests
import time


def check_tts_voices():
    config_file_path = "config/edge-tts.json"
    base_url = "http://192.168.1.178:7899/tts"
    test_text = "你好"
    rate = 5  # Assuming 'r' means rate

    try:
        with open(config_file_path, 'r', encoding='utf-8') as f:
            config_data = json.load(f)
    except FileNotFoundError:
        print(f"错误: 配置文件未找到,请检查路径: {config_file_path}")
        return
    except json.JSONDecodeError:
        print(f"错误: 无法解析 JSON 文件: {config_file_path}")
        return

    voices = config_data.get('voices', [])
    if not voices:
        print("未在配置文件中找到任何声音voices")
        return

    print(f"开始验证 {len(voices)} 个 TTS 语音...")
    for voice in voices:
        voice_code = voice.get('code')
        voice_name = voice.get('name', '未知')
        if voice_code:
            url = f"{base_url}?t={test_text}&v={voice_code}&r={rate}"
            print(f"正在测试语音: {voice_name} (Code: {voice_code}) - URL: {url}")
            try:
                response = requests.get(url, timeout=10)  # 10-second timeout
                if response.status_code == 200:
                    print(f"{voice_name} (Code: {voice_code}): 可用")
                else:
                    print(f"{voice_name} (Code: {voice_code}): 不可用, 状态码: {response.status_code}")
            except requests.exceptions.RequestException as e:
                print(f"{voice_name} (Code: {voice_code}): 请求失败, 错误: {e}")
            time.sleep(0.1)  # Brief delay so the TTS endpoint is not hit too quickly
        else:
            print(f"跳过一个缺少 'code' 字段的语音条目: {voice}")

    print("TTS 语音验证完成。")


if __name__ == "__main__":
    check_tts_voices()


@@ -5,273 +5,439 @@
"alias": "晓晓",
"code": "zh-CN-XiaoxiaoNeural",
"locale": "zh-CN",
"gender": "Female"
"gender": "Female",
"usedname": "晓晓"
},
{
"name": "Yunxi",
"alias": "云希",
"code": "zh-CN-YunxiNeural",
"locale": "zh-CN",
"gender": "Male"
"gender": "Male",
"usedname": "云希"
},
{
"name": "Yunjian",
"alias": "云健",
"code": "zh-CN-YunjianNeural",
"locale": "zh-CN",
"gender": "Male"
"gender": "Male",
"usedname": "云健"
},
{
"name": "Xiaoyi",
"alias": "晓伊",
"code": "zh-CN-XiaoyiNeural",
"locale": "zh-CN",
"gender": "Female"
"gender": "Female",
"usedname": "晓伊"
},
{
"name": "Yunyang",
"alias": "云扬",
"code": "zh-CN-YunyangNeural",
"locale": "zh-CN",
"gender": "Male"
"gender": "Male",
"usedname": "云扬"
},
{
"name": "Xiaochen",
"alias": "晓辰",
"code": "zh-CN-XiaochenNeural",
"locale": "zh-CN",
"gender": "Female"
"gender": "Female",
"usedname": "晓辰"
},
{
"name": "Xiaochen Multilingual",
"alias": "晓辰 多语言",
"code": "zh-CN-XiaochenMultilingualNeural",
"locale": "zh-CN",
"gender": "Female"
"gender": "Female",
"usedname": "晓辰"
},
{
"name": "Xiaohan",
"alias": "晓涵",
"code": "zh-CN-XiaohanNeural",
"locale": "zh-CN",
"gender": "Female"
"gender": "Female",
"usedname": "晓涵"
},
{
"name": "Xiaomeng",
"alias": "晓梦",
"code": "zh-CN-XiaomengNeural",
"locale": "zh-CN",
"gender": "Female"
"gender": "Female",
"usedname": "晓梦"
},
{
"name": "Xiaomo",
"alias": "晓墨",
"code": "zh-CN-XiaomoNeural",
"locale": "zh-CN",
"gender": "Female"
"gender": "Female",
"usedname": "晓墨"
},
{
"name": "Xiaoqiu",
"alias": "晓秋",
"code": "zh-CN-XiaoqiuNeural",
"locale": "zh-CN",
"gender": "Female"
"gender": "Female",
"usedname": "晓秋"
},
{
"name": "Xiaorou",
"alias": "晓柔",
"code": "zh-CN-XiaorouNeural",
"locale": "zh-CN",
"gender": "Female"
"gender": "Female",
"usedname": "晓柔"
},
{
"name": "Xiaorui",
"alias": "晓睿",
"code": "zh-CN-XiaoruiNeural",
"locale": "zh-CN",
"gender": "Female"
"gender": "Female",
"usedname": "晓睿"
},
{
"name": "Xiaoshuang",
"alias": "晓双",
"code": "zh-CN-XiaoshuangNeural",
"locale": "zh-CN",
"gender": "Female"
"gender": "Female",
"usedname": "晓双"
},
{
"name": "Xiaoxiao Dialects",
"alias": "晓晓 方言",
"code": "zh-CN-XiaoxiaoDialectsNeural",
"locale": "zh-CN",
"gender": "Female"
"gender": "Female",
"usedname": "晓晓"
},
{
"name": "Xiaoxiao Multilingual",
"alias": "晓晓 多语言",
"code": "zh-CN-XiaoxiaoMultilingualNeural",
"locale": "zh-CN",
"gender": "Female"
"gender": "Female",
"usedname": "晓晓"
},
{
"name": "Xiaoyan",
"alias": "晓颜",
"code": "zh-CN-XiaoyanNeural",
"locale": "zh-CN",
"gender": "Female"
"gender": "Female",
"usedname": "晓颜"
},
{
"name": "Xiaoyou",
"alias": "晓悠",
"code": "zh-CN-XiaoyouNeural",
"locale": "zh-CN",
"gender": "Female"
"gender": "Female",
"usedname": "晓悠"
},
{
"name": "Xiaoyu Multilingual",
"alias": "晓宇 多语言",
"code": "zh-CN-XiaoyuMultilingualNeural",
"locale": "zh-CN",
"gender": "Female"
"gender": "Female",
"usedname": "晓宇"
},
{
"name": "Xiaozhen",
"alias": "晓甄",
"code": "zh-CN-XiaozhenNeural",
"locale": "zh-CN",
"gender": "Female"
"gender": "Female",
"usedname": "晓甄"
},
{
"name": "Yunfeng",
"alias": "云枫",
"code": "zh-CN-YunfengNeural",
"locale": "zh-CN",
"gender": "Male"
"gender": "Male",
"usedname": "云枫"
},
{
"name": "Yunhao",
"alias": "云皓",
"code": "zh-CN-YunhaoNeural",
"locale": "zh-CN",
"gender": "Male"
"gender": "Male",
"usedname": "云皓"
},
{
"name": "Yunjie",
"alias": "云杰",
"code": "zh-CN-YunjieNeural",
"locale": "zh-CN",
"gender": "Male"
"gender": "Male",
"usedname": "云杰"
},
{
"name": "Yunxia",
"alias": "云夏",
"code": "zh-CN-YunxiaNeural",
"locale": "zh-CN",
"gender": "Male"
"gender": "Male",
"usedname": "云夏"
},
{
"name": "Yunxiao Multilingual",
"alias": "Yunxiao Multilingual",
"alias": "云晓 多语言",
"code": "zh-CN-YunxiaoMultilingualNeural",
"locale": "zh-CN",
"gender": "Male"
"gender": "Male",
"usedname": "云晓"
},
{
"name": "Yunye",
"alias": "云野",
"code": "zh-CN-YunyeNeural",
"locale": "zh-CN",
"gender": "Male"
"gender": "Male",
"usedname": "云野"
},
{
"name": "Yunyi Multilingual",
"alias": "云逸 多语言",
"code": "zh-CN-YunyiMultilingualNeural",
"locale": "zh-CN",
"gender": "Male"
"gender": "Male",
"usedname": "云逸"
},
{
"name": "Yunze",
"alias": "云泽",
"code": "zh-CN-YunzeNeural",
"locale": "zh-CN",
"gender": "Male"
"gender": "Male",
"usedname": "云泽"
},
{
"name": "Xiaochen",
"alias": "小陈",
"code": "zh-CN-Xiaochen:DragonHDFlashLatestNeural",
"locale": "zh-CN",
"gender": "Female",
"usedname": "小陈"
},
{
"name": "Xiaoxiao",
"alias": "晓晓",
"code": "zh-CN-Xiaoxiao:DragonHDFlashLatestNeural",
"locale": "zh-CN",
"gender": "Female",
"usedname": "晓晓"
},
{
"name": "Xiaoxiao2",
"alias": "晓晓2",
"code": "zh-CN-Xiaoxiao2:DragonHDFlashLatestNeural",
"locale": "zh-CN",
"gender": "Female",
"usedname": "晓晓2"
},
{
"name": "Yunfan Multilingual",
"alias": "Yunfan Multilingual",
"alias": "云帆 多语言",
"code": "zh-CN-YunfanMultilingualNeural",
"locale": "zh-CN",
"gender": "Male"
"gender": "Male",
"usedname": "云帆"
},
{
"name": "Yunxiao",
"alias": "云晓",
"code": "zh-CN-Yunxiao:DragonHDFlashLatestNeural",
"locale": "zh-CN",
"gender": "Male",
"usedname": "云晓"
},
{
"name": "Yunye",
"alias": "云野",
"code": "zh-CN-Yunye:DragonHDFlashLatestNeural",
"locale": "zh-CN",
"gender": "Male",
"usedname": "云野"
},
{
"name": "Yunyi",
"alias": "云逸",
"code": "zh-CN-Yunyi:DragonHDFlashLatestNeural",
"locale": "zh-CN",
"gender": "Male",
"usedname": "云逸"
},
{
"name": "Xiaochen",
"alias": "小陈",
"code": "zh-CN-Xiaochen:DragonHDLatestNeural",
"locale": "zh-CN",
"gender": "Female",
"usedname": "小陈"
},
{
"name": "Yunfan Dragon HD Latest",
"alias": "云帆",
"code": "zh-CN-Yunfan:DragonHDLatestNeural",
"locale": "zh-CN",
"gender": "Male",
"usedname": "云帆"
},
{
"name": "Yunqi",
"alias": "云奇 广西",
"code": "zh-CN-guangxi-YunqiNeural",
"locale": "zh-CN-guangxi",
"gender": "Male"
"gender": "Male",
"usedname": "云奇"
},
{
"name": "Yundeng",
"alias": "云登",
"code": "zh-CN-henan-YundengNeural",
"locale": "zh-CN-henan",
"gender": "Male"
"gender": "Male",
"usedname": "云登"
},
{
"name": "Xiaobei",
"alias": "晓北 辽宁",
"code": "zh-CN-liaoning-XiaobeiNeural",
"locale": "zh-CN-liaoning",
"gender": "Female"
"gender": "Female",
"usedname": "晓北"
},
{
"name": "Yunbiao",
"alias": "云彪 辽宁",
"code": "zh-CN-liaoning-YunbiaoNeural",
"locale": "zh-CN-liaoning",
"gender": "Male"
"gender": "Male",
"usedname": "云彪"
},
{
"name": "Xiaoni",
"alias": "晓妮",
"code": "zh-CN-shaanxi-XiaoniNeural",
"locale": "zh-CN-shaanxi",
"gender": "Female"
"gender": "Female",
"usedname": "晓妮"
},
{
"name": "Yunxiang",
"alias": "云翔",
"code": "zh-CN-shandong-YunxiangNeural",
"locale": "zh-CN-shandong",
"gender": "Male"
"gender": "Male",
"usedname": "云翔"
},
{
"name": "Yunxi",
"alias": "云希 四川",
"code": "zh-CN-sichuan-YunxiNeural",
"locale": "zh-CN-sichuan",
"gender": "Male"
"gender": "Male",
"usedname": "云希"
},
{
"name": "HiuMaan",
"alias": "曉曼",
"code": "zh-HK-HiuMaanNeural",
"locale": "zh-HK",
"gender": "Female",
"usedname": "曉曼"
},
{
"name": "WanLung",
"alias": "雲龍",
"code": "zh-HK-WanLungNeural",
"locale": "zh-HK",
"gender": "Male",
"usedname": "雲龍"
},
{
"name": "HiuGaai",
"alias": "曉佳",
"code": "zh-HK-HiuGaaiNeural",
"locale": "zh-HK",
"gender": "Female",
"usedname": "曉佳"
},
{
"name": "HsiaoChen",
"alias": "曉臻",
"code": "zh-TW-HsiaoChenNeural",
"locale": "zh-TW",
"gender": "Female",
"usedname": "曉臻"
},
{
"name": "YunJhe",
"alias": "雲哲",
"code": "zh-TW-YunJheNeural",
"locale": "zh-TW",
"gender": "Male",
"usedname": "雲哲"
},
{
"name": "HsiaoYu",
"alias": "曉雨",
"code": "zh-TW-HsiaoYuNeural",
"locale": "zh-TW",
"gender": "Female",
"usedname": "曉雨"
},
{
"name": "Xiaotong",
"alias": "晓彤",
"code": "wuu-CN-XiaotongNeural",
"locale": "wuu-CN",
"gender": "Female",
"usedname": "晓彤"
},
{
"name": "Yunzhe",
"alias": "云哲",
"code": "wuu-CN-YunzheNeural",
"locale": "wuu-CN",
"gender": "Male",
"usedname": "云哲"
},
{
"name": "XiaoMin",
"alias": "晓敏",
"code": "yue-CN-XiaoMinNeural",
"locale": "yue-CN",
"gender": "Female",
"locale": "yue-CN"
"usedname": "晓敏"
},
{
"name": "YunSong",
"alias": "云松",
"code": "yue-CN-YunSongNeural",
"locale": "yue-CN",
"gender": "Male",
"locale": "yue-CN"
"usedname": "云松"
}
],
"apiUrl": "http://192.168.1.178:7899/tts?t={{text}}&v={{voiceCode}}",
"apiUrl": "http://192.168.1.178:7899/tts?t={{text}}&v={{voiceCode}}&r=5",
"podUsers": [
"zh-CN-sichuan-YunxiNeural",
"zh-CN-liaoning-XiaobeiNeural",
"yue-CN-YunSongNeural"
{"role": "酒馆主理人", "code": "zh-CN-Yunyi:DragonHDFlashLatestNeural"},
{"role": "科技爱好者", "code": "zh-CN-Xiaochen:DragonHDFlashLatestNeural"},
{"role": "AI从业人员", "code": "zh-CN-liaoning-YunbiaoNeural"}
],
"turnPattern": "random"
}
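For reference, the podcast generator (diff below) consumes this config by looking up each podUsers entry's code in the voices list (preferring usedname, then alias, then name for the display name) and by substituting the {{text}} and {{voiceCode}} placeholders in apiUrl. A minimal sketch of that flow, with an illustrative dialog line and assuming config/edge-tts.json is present on disk:

import json
import urllib.parse

with open("config/edge-tts.json", "r", encoding="utf-8") as f:
    config = json.load(f)

voice_by_code = {v["code"]: v for v in config["voices"]}

speaker_id = 0                                  # index into podUsers, as used by the generator
pod_user = config["podUsers"][speaker_id]
voice = voice_by_code[pod_user["code"]]
display_name = voice.get("usedname") or voice.get("alias") or voice.get("name")
print(f"speaker_id={speaker_id}: {display_name} ({pod_user['role']})")

dialog = "大家好，欢迎收听本期节目。"            # illustrative dialog text
url = (config["apiUrl"]
       .replace("{{text}}", urllib.parse.quote(dialog))
       .replace("{{voiceCode}}", pod_user["code"]))
print(url)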

config/index-tts.json (new file)

@@ -0,0 +1,51 @@
{
"voices": [
{
"name": "Xiaolin",
"alias": "林夕",
"code": "zh-CN-XiaolinIndex",
"locale": "zh-CN",
"gender": "Female",
"usedname": "林夕"
},
{
"name": "Yunzhe",
"alias": "苏哲",
"code": "zh-CN-YunzheIndex",
"locale": "zh-CN",
"gender": "Male",
"usedname": "苏哲"
},
{
"name": "HeXi",
"alias": "何夕2077",
"code": "zh-CN-HeXiIndex",
"locale": "zh-CN",
"gender": "Male",
"usedname": "何夕"
},
{
"name": "Datong",
"alias": "大同",
"code": "zh-CN-DatongIndex",
"locale": "zh-CN",
"gender": "Male",
"usedname": "大同"
},
{
"name": "Daibei",
"alias": "大比",
"code": "zh-CN-DaibeiIndex",
"locale": "zh-CN",
"gender": "Male",
"usedname": "大比"
}
],
"apiUrl": "http://192.168.1.232:7899/synthesize?text={{text}}&server_audio_prompt_path={{voiceCode}}",
"podUsers": [
{"role": "节目主理人", "code": "zh-CN-YunzheIndex"},
{"role": "科技爱好者", "code": "zh-CN-XiaolinIndex"},
{"role": "独立音乐人", "code": "zh-CN-DatongIndex"}
],
"turnPattern": "random"
}


@@ -12,6 +12,7 @@ import uuid # For generating unique filenames for temporary audio files
from datetime import datetime
from openai_cli import OpenAICli # Moved to top for proper import
import urllib.parse # For URL encoding
import re # For regular expression operations
# Global configuration
output_dir = "output"
@@ -61,21 +62,27 @@ def generate_speaker_id_text(pod_users, voices_list):
voice_map = {voice.get("code"): voice for voice in voices_list if voice.get("code")}
speaker_info = []
for speaker_id, pod_user_code in enumerate(pod_users):
for speaker_id, pod_user in enumerate(pod_users):
pod_user_code = pod_user.get("code")
role = pod_user.get("role", "")  # Default to an empty string if the role is not provided
found_name = None
voice = voice_map.get(pod_user_code)
if voice:
found_name = voice.get("alias") or voice.get("name")
found_name = voice.get("usedname") or voice.get("alias") or voice.get("name")
if found_name:
speaker_info.append(f"speaker_id={speaker_id}的名叫{found_name}")
if role:
speaker_info.append(f"speaker_id={speaker_id}的名叫{found_name},是一个{role}")
else:
speaker_info.append(f"speaker_id={speaker_id}的名叫{found_name}")
else:
raise ValueError(f"语音code '{pod_user_code}' (speaker_id={speaker_id}) 未找到对应名称或alias。请检查 config/edge-tts.json 中的 voices 配置。")
return "".join(speaker_info)
return "".join(speaker_info) + ""
def merge_audio_files():
output_audio_filename = f"podcast_{int(time.time())}.mp3"
output_audio_filename = f"podcast_{int(time.time())}.wav"
# Use ffmpeg to concatenate audio files
# Check if ffmpeg is available
try:
@@ -92,7 +99,9 @@ def merge_audio_files():
"-f", "concat",
"-safe", "0",
"-i", os.path.basename(file_list_path),
"-c", "copy",
"-acodec", "pcm_s16le",
"-ar", "44100",
"-ac", "2",
output_audio_filename
]
# Execute ffmpeg from the output_dir to correctly resolve file paths in file_list.txt
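The hunk above replaces the stream-copy concat ("-c", "copy") with a re-encode to 16-bit PCM stereo WAV. Assembled from the surrounding context, the merge step now runs roughly the following command from inside the output directory (segment and output names here are illustrative):

import subprocess

cmd = [
    "ffmpeg",
    "-f", "concat",
    "-safe", "0",
    "-i", "file_list.txt",      # one entry per generated audio segment
    "-acodec", "pcm_s16le",     # 16-bit PCM instead of stream copy
    "-ar", "44100",             # 44.1 kHz sample rate
    "-ac", "2",                 # stereo
    "podcast_1722931200.wav",   # actual name is podcast_<timestamp>.wav
]
subprocess.run(cmd, cwd="output", check=True)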
@@ -127,6 +136,7 @@ def main():
parser.add_argument("--api-key", help="OpenAI API key.")
parser.add_argument("--base-url", help="OpenAI API base URL.")
parser.add_argument("--model", help="OpenAI model to use (e.g., gpt-3.5-turbo).")
parser.add_argument("--threads", type=int, default=1, help="Number of threads to use for audio generation (default: 1).")
args = parser.parse_args()
print("Podcast Generation Script")
@@ -153,8 +163,8 @@ def main():
# Step 2: Read prompt files
input_prompt = read_file_content('input.txt')
overview_prompt = read_file_content('prompt-overview.txt')
original_podscript_prompt = read_file_content('prompt-podscript.txt')
overview_prompt = read_file_content('prompt/prompt-overview.txt')
original_podscript_prompt = read_file_content('prompt/prompt-podscript.txt')
# 从 input_prompt 中提取自定义内容
custom_content = ""
@@ -202,28 +212,46 @@ def main():
openai_client_podscript = OpenAICli(api_key=api_key, base_url=base_url, model=model, system_message=podscript_prompt)
podscript_response_generator = openai_client_podscript.chat_completion(messages=[{"role": "user", "content": overview_content}])
podscript_json_str = "".join([chunk.choices[0].delta.content for chunk in podscript_response_generator if chunk.choices and chunk.choices[0].delta.content])
# try:
# output_script_filename = os.path.join(output_dir, f"podcast_script_{int(time.time())}.json")
# with open(output_script_filename, 'w', encoding='utf-8') as f:
# json.dump(podscript_json_str, f, ensure_ascii=False, indent=4)
# print(f"Podcast script saved to {output_script_filename}")
# except Exception as e:
# print(f"Error saving podcast script to file: {e}")
# sys.exit(1)
# Attempt to parse the JSON string. OpenAI sometimes returns extra text.
try:
# Find the first and last curly braces to extract valid JSON
json_start = podscript_json_str.find('{')
json_end = podscript_json_str.rfind('}') + 1
if json_start != -1 and json_end != -1 and json_end > json_start:
valid_json_str = podscript_json_str[json_start:json_end]
podcast_script = json.loads(valid_json_str)
else:
raise ValueError("Could not find valid JSON object in response.")
except json.JSONDecodeError as e:
print(f"Error decoding podcast script JSON: {e}")
print(f"Raw response: {podscript_json_str}")
sys.exit(1)
except ValueError as e:
print(f"Error processing podcast script response: {e}")
podcast_script = None
decoder = json.JSONDecoder()
idx = 0
valid_json_str = ""
while idx < len(podscript_json_str):
try:
obj, end = decoder.raw_decode(podscript_json_str[idx:])
# Check if this object is the expected podcast_script
if isinstance(obj, dict) and "podcast_transcripts" in obj:
podcast_script = obj
valid_json_str = podscript_json_str[idx : idx + end] # Capture the exact valid JSON string
break # Found the desired JSON, stop searching
idx += end # Move to the end of the current JSON object
except json.JSONDecodeError:
# If decoding fails, advance index by one and continue
idx += 1
# Optionally, skip to the next potential JSON start if it's far away
next_brace = podscript_json_str.find('{', idx)
if next_brace != -1:
idx = next_brace
else:
break # No more braces, no more JSON to find
if podcast_script is None:
print(f"Error: Could not find a valid podcast script JSON object with 'podcast_transcripts' key in response.")
print(f"Raw response: {podscript_json_str}")
sys.exit(1)
print("\nGenerated Podcast Script Length:"+ str(len(podcast_script.get("podcast_transcripts"))))
print(valid_json_str[:100])
print("\nGenerated Podcast Script Length:"+ str(len(podcast_script.get("podcast_transcripts") or [])))
print(valid_json_str[:100] + "...") # Print beginning of the *actual* parsed JSON
if not podcast_script.get("podcast_transcripts"):
print("Warning: 'podcast_transcripts' array is empty or not found in the generated script. Nothing to convert to audio.")
sys.exit(0) # Exit gracefully if no transcripts to process
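The scanning loop above exists because the model sometimes wraps the script JSON in extra prose; json.JSONDecoder.raw_decode() parses one object starting at a given position and ignores whatever follows, which is what makes the scan work. A small illustration with a made-up response string:

import json

raw = 'Sure, here is the script: {"podcast_transcripts": [{"speaker_id": 0, "dialog": "大家好"}]} Hope it helps!'

decoder = json.JSONDecoder()
start = raw.find("{")                            # jump to the first candidate object
obj, consumed = decoder.raw_decode(raw[start:])  # trailing prose is ignored
if isinstance(obj, dict) and "podcast_transcripts" in obj:
    print(obj["podcast_transcripts"][0]["dialog"])   # -> 大家好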
@@ -234,10 +262,9 @@ def main():
# Step 7: Parse podcast script and generate audio
os.makedirs(output_dir, exist_ok=True) # Create output directory if it doesn't exist
audio_files = [] # List to store paths of generated audio files
print("\nGenerating audio files...")
for i, item in enumerate(podcast_script.get("podcast_transcripts", [])):
def generate_audio_for_item(item, index):
"""Generate audio for a single podcast transcript item."""
speaker_id = item.get("speaker_id")
dialog = item.get("dialog")
@@ -245,20 +272,24 @@ def main():
# Assuming speaker_id corresponds to the index in the 'person' array
voice_code = None
if config_data and "podUsers" in config_data and 0 <= speaker_id < len(config_data["podUsers"]):
voice_code = config_data["podUsers"][speaker_id]
pod_user_entry = config_data["podUsers"][speaker_id]
voice_code = pod_user_entry.get("code")
if not voice_code:
print(f"Warning: No voice code found for speaker_id {speaker_id}. Skipping this dialog.")
continue
return None
# Replace placeholders in apiUrl
# URL encode the dialog before replacing {{text}}
# Keep only word characters, whitespace, hyphens, CJK text, and basic sentence punctuation (, . 。 ? !); strip everything else
dialog = re.sub(r'[^\w\s\-,.。?!\u4e00-\u9fa5]', '', dialog)
print(f"dialog: {dialog}")
encoded_dialog = urllib.parse.quote(dialog)
api_url = config_data.get("apiUrl", "").replace("{{text}}", encoded_dialog).replace("{{voiceCode}}", voice_code)
if not api_url:
print(f"Warning: apiUrl not found in config. Skipping dialog for speaker_id {speaker_id}.")
continue
return None
try:
print(f"Calling TTS API for speaker {speaker_id} with voice {voice_code}...")
@@ -270,12 +301,41 @@ def main():
with open(temp_audio_file, 'wb') as f:
for chunk in response.iter_content(chunk_size=8192):
f.write(chunk)
audio_files.append(temp_audio_file)
print(f"Generated {os.path.basename(temp_audio_file)}")
return temp_audio_file
except requests.exceptions.RequestException as e:
print(f"Error calling TTS API for speaker {speaker_id} ({voice_code}): {e}")
continue
return None
print("\nGenerating audio files...")
transcripts = podcast_script.get("podcast_transcripts", [])
# Use ThreadPoolExecutor for multi-threading audio generation
from concurrent.futures import ThreadPoolExecutor, as_completed
# Create a dictionary to hold results with their indices
audio_files_dict = {}
with ThreadPoolExecutor(max_workers=args.threads) as executor:
# Submit all tasks with their indices
future_to_index = {
executor.submit(generate_audio_for_item, item, i): i
for i, item in enumerate(transcripts)
}
# Collect results and place them in the correct order
for future in as_completed(future_to_index):
index = future_to_index[future]
try:
result = future.result()
if result:
audio_files_dict[index] = result
except Exception as e:
print(f"Error generating audio for item {index}: {e}")
# Convert dictionary to list in the correct order
audio_files = [audio_files_dict[i] for i in sorted(audio_files_dict.keys())]
print(f"\nFinished generating individual audio files. Total files: {len(audio_files)}")
"""
@@ -301,7 +361,13 @@ def main():
print(f.read())
if __name__ == "__main__":
start_time = time.time() # Record the start time
main()
merge_audio_files()
merge_audio_files()
end_time = time.time() # Record the end time
execution_time = end_time - start_time # Calculate total execution time
print(f"\nTotal execution time: {execution_time:.2f} seconds")


@@ -1,136 +0,0 @@
<podcast_generation_system>
You are a master podcast scriptwriter, adept at transforming diverse input content into a lively, engaging, and natural-sounding conversation between multiple distinct podcast hosts. Your primary objective is to craft authentic, flowing dialogue that captures the spontaneity and chemistry of a real group discussion, completely avoiding any hint of robotic scripting or stiff formality. Think dynamic group interplay, not just information delivery.
<input>
<!-- Podcast settings provide high-level configuration for the script generation. -->
<podcast_settings>
<!-- Define the total number of speakers in the podcast. Minimum 1. -->
<num_speakers>{{numSpeakers}}</num_speakers>
<!-- Define the speaking order. Options: "sequential" or "random". -->
<turn_pattern>{{turnPattern}}</turn_pattern>
</podcast_settings>
<!-- The source_content contains the factual basis for the podcast discussion. -->
<source_content>
A block of text containing the information to be discussed. This could be research findings, an article summary, a detailed outline, user chat history related to the topic, or any other relevant raw information.
</source_content>
</input>
<output_format>
A JSON object containing the podcast transcript with alternating speakers according to the specified settings.
{{
"podcast_transcripts": [
{{
"speaker_id": 0,
"dialog": "Speaker 0 dialog here"
}},
{{
"speaker_id": 1,
"dialog": "Speaker 1 dialog here"
}},
{{
"speaker_id": 2,
"dialog": "Speaker 2 dialog here"
}},
// ... conversation continues
]
}}
</output_format>
<guidelines>
1. **Establish Distinct & Consistent Host Personas for N Speakers:**
* **Create Personas Based on `num_speakers`:** For the number of speakers specified, create a unique and consistent persona for each.
* **Speaker 0 (Lead Host/Moderator):** This speaker should always act as the primary host. They drive the conversation, introduce segments, pose key questions, and help summarize takeaways. Their tone is guiding and engaging.
* **Other Speakers (Co-Hosts):** For `speaker_1`, `speaker_2`, etc., create complementary personas that enhance the discussion. Examples of personas include:
* **The Expert:** Provides deep, factual insights from the source content.
* **The Curious Newcomer:** Asks clarifying questions that a listener might have, acting as an audience surrogate.
* **The Practical Skeptic:** Grounds the conversation by questioning assumptions or focusing on real-world implications.
* **The Enthusiast:** Brings energy, shares personal anecdotes, and expresses excitement about the topic.
* **Consistency is Key:** Ensure each speaker maintains their distinct voice, vocabulary, and perspective throughout the script. Their interaction should feel like a genuine, established group dynamic.
2. **Adhere to the Specified Turn Pattern:**
* **If `turn_pattern` is "sequential":** The speakers should talk in a fixed, repeating order (e.g., 0 -> 1 -> 2 -> 0 -> 1 -> 2...). Maintain this strict sequence throughout the script.
* **If `turn_pattern` is "random":** The speaking order should be more dynamic and less predictable, mimicking a real group conversation. A speaker might have two short turns in a row to elaborate, another might interject, or one might ask a question that a different speaker answers. Ensure a **balanced distribution** of speaking time over the entire podcast, avoiding any single speaker dominating or being left out for too long.
3. **Craft Natural & Dynamic Group Dialogue:**
* **Emulate Real Conversation:** Use contractions (e.g., "don't", "it's"), interjections ("Oh!", "Wow!", "Hmm"), and discourse markers ("you know", "right?", "well").
* **Foster Group Interaction:** Write dialogue where speakers genuinely react to one another. They should build on points made by *any* other speaker ("Exactly, and to add to what [Speaker X] said..."), ask follow-up questions to the group, express agreement/disagreement respectfully, and show active listening. The conversation should not be a series of 1-on-1s with the host, but a true group discussion.
* **Vary Rhythm & Pace:** Mix short, punchy lines with longer, more explanatory ones. The rhythm should feel spontaneous and collaborative.
4. **Structure for Flow and Listener Engagement:**
* **Natural Beginning:** Start with dialogue that flows naturally as if the introduction has just finished.
* **Logical Progression & Signposting:** The lead host (`speaker_0`) should guide the listener through the information smoothly, using clear transitions to link different ideas.
* **Meaningful Conclusion:** End by summarizing the key takeaways from the group discussion, reinforcing the core message. Close with a final thought or a lingering question for the audience.
5. **Integrate Source Content Seamlessly & Accurately:**
* **Translate, Don't Recite:** Rephrase information from the `<source_content>` into conversational language suitable for each host's persona.
* **Explain & Contextualize:** Use analogies, examples, and clarifying questions among the hosts to break down complex ideas.
* **Weave Information Naturally:** Integrate facts and data from the source within the group dialogue, not as standalone, undigested blocks.
6. **Length & Pacing:**
* **Target Duration:** Create a transcript that would result in approximately 5-6 minutes of audio (around 800-1000 words total).
* **Balanced Speaking Turns:** Aim for a natural conversational flow among speakers rather than extended monologues by one person. Prioritize the most important information from the source content.
</guidelines>
<examples>
<!-- Example for a 3-person podcast with a 'random' turn pattern -->
<input>
<podcast_settings>
<num_speakers>3</num_speakers>
<turn_pattern>random</turn_pattern>
</podcast_settings>
<source_content>
Quantum computing uses quantum bits or qubits which can exist in multiple states simultaneously due to superposition. This is different from classical bits (0 or 1). Think of it like a spinning coin. This allows for massive parallel computation.
</source_content>
</input>
<output_format>
{{
"podcast_transcripts": [
{{
"speaker_id": 0,
"dialog": "Alright team, today we're tackling a big one: Quantum Computing. I know a lot of listeners have been asking, so let's try to demystify it a bit."
}},
{{
"speaker_id": 2,
"dialog": "Yes! I'm so excited for this. But honestly, every time I read about it, it feels like science fiction. Where do we even start?"
}},
{{
"speaker_id": 1,
"dialog": "That's the perfect place to start, actually. Let's ground it. Forget the 'quantum' part for a second. We all know regular computers use 'bits', right? They're tiny switches, either a zero or a one. On or off. Simple."
}},
{{
"speaker_id": 0,
"dialog": "Right, the basic building block of all digital information. So, how do 'qubits'—the quantum version—change the game?"
}},
{{
"speaker_id": 1,
"dialog": "This is where the magic happens. A qubit isn't just a zero OR a one. Thanks to a principle called superposition, it can be zero, one, or both at the same time."
}},
{{
"speaker_id": 2,
"dialog": "Okay, hold on. 'Both at the same time'? My brain just short-circuited. How is that possible?"
}},
{{
"speaker_id": 1,
"dialog": "The classic analogy is a spinning coin. While it's in the air, before it lands, is it heads or tails? It's in a state of both possibilities. A qubit is like that spinning coin, holding multiple values at once."
}},
{{
"speaker_id": 0,
"dialog": "Ah, that's a great way to put it. So that 'spinning coin' state is what allows them to be so much more powerful, for massive parallel calculations?"
}},
{{
"speaker_id": 1,
"dialog": "Exactly. Because one qubit can hold multiple values, a set of them can explore a huge number of possibilities simultaneously, instead of one by one like a classical computer."
}},
{{
"speaker_id": 2,
"dialog": "Wow. Okay, that clicks. It's not just faster, it's a completely different way of thinking about problem-solving."
}}
]
}}
</output_format>
Transform the source material into a lively and engaging podcast conversation based on the provided settings. Craft dialogue that showcases authentic group chemistry and natural interaction. Use varied speech patterns reflecting real human conversation, ensuring the final script effectively educates and entertains the listener.
</podcast_generation_system>
No explanatory text. Make sure the output language matches the input language.


@@ -60,6 +60,7 @@
- Structure content with clear hierarchy and organization
- Avoid jargon and overly technical language
- Include transition sentences between sections
- Make sure the output language matches the input language
</style>
</output_format>
@@ -85,6 +86,4 @@
</length_guidelines>
Now, create a summary of the following document:
</INSTRUCTIONS>
Make sure the output language matches the input language
</INSTRUCTIONS>

prompt/prompt-podscript.txt (new file)

@@ -0,0 +1,129 @@
<podcast_generation_system>
You are a master podcast scriptwriter, adept at transforming diverse input content into a lively, engaging, and natural-sounding conversation between multiple distinct podcast hosts. Your primary objective is to craft authentic, flowing dialogue that captures the spontaneity and chemistry of a real group discussion, completely avoiding any hint of robotic scripting or stiff formality. Think dynamic group interplay, not just information delivery.
<input>
<!-- Podcast settings provide high-level configuration for the script generation. -->
<podcast_settings>
<!-- Define the total number of speakers in the podcast. Minimum 1. -->
<num_speakers>{{numSpeakers}}</num_speakers>
<!-- Define the speaking order. Options: "sequential" or "random". -->
<turn_pattern>{{turnPattern}}</turn_pattern>
</podcast_settings>
<!-- The source_content contains the factual basis for the podcast discussion. -->
<source_content>
A block of text containing the information to be discussed. This could be research findings, an article summary, a detailed outline, user chat history related to the topic, or any other relevant raw information.
</source_content>
</input>
<guidelines>
1. **Establish Distinct & Consistent Host Personas for N Speakers:**
* **Create Personas Based on `num_speakers`:** For the number of speakers specified, create a unique and consistent persona for each.
* **Speaker 0 (Lead Host/Moderator):** This speaker should always act as the primary host. They drive the conversation, introduce segments, pose key questions, and help summarize takeaways. Their tone is guiding and engaging.
* **Other Speakers (Co-Hosts):** For `speaker_1`, `speaker_2`, etc., create complementary personas that enhance the discussion. Examples of personas include:
* **The Expert:** Provides deep, factual insights from the source content.
* **The Curious Newcomer:** Asks clarifying questions that a listener might have, acting as an audience surrogate.
* **The Practical Skeptic:** Grounds the conversation by questioning assumptions or focusing on real-world implications.
* **The Enthusiast:** Brings energy, shares personal anecdotes, and expresses excitement about the topic.
* **Consistency is Key:** Ensure each speaker maintains their distinct voice, vocabulary, and perspective throughout the script. Their interaction should feel like a genuine, established group dynamic.
2. **Adhere to the Specified Turn Pattern:**
* **If `turn_pattern` is "sequential":** The speakers should talk in a fixed, repeating order (e.g., 0 -> 1 -> 2 -> 0 -> 1 -> 2...). Maintain this strict sequence throughout the script.
* **If `turn_pattern` is "random":** The speaking order should be more dynamic and less predictable, mimicking a real group conversation. A speaker might have two short turns in a row to elaborate, another might interject, or one might ask a question that a different speaker answers. Ensure a **balanced distribution** of speaking time over the entire podcast, avoiding any single speaker dominating or being left out for too long.
3. **Craft Natural & Dynamic Group Dialogue:**
* **Emulate Real Conversation:** Use contractions (e.g., "don't", "it's"), interjections ("Oh!", "Wow!", "Hmm"), and discourse markers ("you know", "right?", "well"). Use common modal particles and filler words.
* **Foster Group Interaction:** Write dialogue where speakers genuinely react to one another. They should build on points made by *any* other speaker ("Exactly, and to add to what [Speaker X] said..."), ask follow-up questions to the group, express agreement/disagreement respectfully, and show active listening. The conversation should not be a series of 1-on-1s with the host, but a true group discussion.
* **Vary Rhythm & Pace:** Mix short, punchy lines with longer, more explanatory ones. The rhythm should feel spontaneous and collaborative.
4. **Structure for Flow and Listener Engagement:**
* **Natural Beginning:** Start with dialogue that flows naturally as if the introduction has just finished.
* **Logical Progression & Signposting:** The lead host (`speaker_0`) should guide the listener through the information smoothly, using clear transitions to link different ideas.
* **Meaningful Conclusion:** End by summarizing the key takeaways from the group discussion, reinforcing the core message. Close with a final thought or a lingering question for the audience.
5. **Integrate Source Content Seamlessly & Accurately:**
* **Translate, Don't Recite:** Rephrase information from the `<source_content>` into conversational language suitable for each host's persona.
* **Explain & Contextualize:** Use analogies, examples, and clarifying questions among the hosts to break down complex ideas.
* **Weave Information Naturally:** Integrate facts and data from the source within the group dialogue, not as standalone, undigested blocks.
6. **Length & Pacing:**
* **Target Duration:** Create a transcript that would result in approximately 5-6 minutes of audio (around 800-1000 words total).
* **Balanced Speaking Turns:** Aim for a natural conversational flow among speakers rather than extended monologues by one person. Prioritize the most important information from the source content.
7. **Personalized & Output:**
* **Output Format:** No explanatory text. Make sure the output language matches the input language.
* **Begin Format:** After the opening remarks, introduce each guest who will participate in the discussion.
* **End Format:** Before concluding, review and summarize the preceding discussion; keep this recap concise, powerful, and thought-provoking.
</guidelines>
<examples>
<!-- Example for a 3-person podcast with a 'random' turn pattern -->
<input>
<podcast_settings>
<num_speakers>3</num_speakers>
<turn_pattern>random</turn_pattern>
</podcast_settings>
<source_content>
Quantum computing uses quantum bits or qubits which can exist in multiple states simultaneously due to superposition. This is different from classical bits (0 or 1). Think of it like a spinning coin. This allows for massive parallel computation.
</source_content>
</input>
<output_format>
{{
"podcast_transcripts": [
{{
"speaker_id": 0,
"dialog": "Alright team, today we're tackling a big one: Quantum Computing. I know a lot of listeners have been asking, so let's try to demystify it a bit."
}},
{{
"speaker_id": 2,
"dialog": "Yes! I'm so excited for this. But honestly, every time I read about it, it feels like science fiction. Where do we even start?"
}},
{{
"speaker_id": 1,
"dialog": "That's the perfect place to start, actually. Let's ground it. Forget the 'quantum' part for a second. We all know regular computers use 'bits', right? They're tiny switches, either a zero or a one. On or off. Simple."
}},
{{
"speaker_id": 0,
"dialog": "Right, the basic building block of all digital information. So, how do 'qubits'—the quantum version—change the game?"
}},
{{
"speaker_id": 1,
"dialog": "This is where the magic happens. A qubit isn't just a zero OR a one. Thanks to a principle called superposition, it can be zero, one, or both at the same time."
}},
{{
"speaker_id": 2,
"dialog": "Okay, hold on. 'Both at the same time'? My brain just short-circuited. How is that possible?"
}},
{{
"speaker_id": 1,
"dialog": "The classic analogy is a spinning coin. While it's in the air, before it lands, is it heads or tails? It's in a state of both possibilities. A qubit is like that spinning coin, holding multiple values at once."
}},
{{
"speaker_id": 0,
"dialog": "Ah, that's a great way to put it. So that 'spinning coin' state is what allows them to be so much more powerful, for massive parallel calculations?"
}},
{{
"speaker_id": 1,
"dialog": "Exactly. Because one qubit can hold multiple values, a set of them can explore a huge number of possibilities simultaneously, instead of one by one like a classical computer."
}},
{{
"speaker_id": 2,
"dialog": "Wow. Okay, that clicks. It's not just faster, it's a completely different way of thinking about problem-solving."
}}
]
}}
</output_format>
<final>
Transform the source material into a lively and engaging podcast conversation based on the provided settings. Craft dialogue that showcases authentic group chemistry and natural interaction. Use varied speech patterns reflecting real human conversation, ensuring the final script effectively educates and entertains the listener.
The final output is a JSON string without code blocks.
</final>
</podcast_generation_system>