feat(podcast): 添加沉浸故事模式支持多语言播客生成

新增沉浸故事生成模式,支持原文朗读和智能分段:
- 服务端新增generate_podcast_with_story_api函数和专用API端点
- 添加故事模式专用prompt模板(prompt-story-overview.txt和prompt-story-podscript.txt)
- 前端新增模式切换UI,支持AI播客和沉浸故事两种模式
- 沉浸故事模式固定消耗30积分,不需要语言和时长参数
- 优化音频静音裁剪逻辑,保留首尾200ms空白提升自然度
- 修复session管理和错误处理,提升系统稳定性
- 新增多语言配置(中英日)支持模式切换文案
This commit is contained in:
hex2077
2025-10-19 22:09:13 +08:00
parent 321e3cded4
commit dd2a1b536f
18 changed files with 672 additions and 116 deletions

View File

@@ -19,7 +19,7 @@ from tts_adapters import TTSAdapter, IndexTTSAdapter, EdgeTTSAdapter, FishAudioA
# Global configuration
output_dir = "output"
# file_list_path is now generated uniquely for each merge operation
tts_providers_config_path = '../config/tts_providers.json'
tts_providers_config_path = '../config/tts_providers-local.json'
# Global cache for TTS provider configurations
tts_provider_configs_cache = {}
@@ -250,7 +250,7 @@ def get_audio_duration(filepath: str) -> Optional[float]:
print(f"An unexpected error occurred while getting audio duration for {filepath}: {e}")
return None
def trim_audio_silence(input_filepath: str, output_filepath: str, silence_threshold_db: float = -60, min_silence_duration: float = 0.5):
def trim_audio_silence(input_filepath: str, output_filepath: str, silence_threshold_db: float = -60, min_silence_duration: float = 0.5, enable_trim: bool = True):
"""
Removes leading and trailing silence from an audio file using ffmpeg.
@@ -259,7 +259,17 @@ def trim_audio_silence(input_filepath: str, output_filepath: str, silence_thresh
output_filepath (str): Path where the trimmed audio file will be saved.
silence_threshold_db (float): Silence threshold in dB. Audio below this level is considered silence.
min_silence_duration (float): Minimum duration of silence to detect, in seconds.
enable_trim (bool): Whether to enable silence trimming. If False, just copy the file.
"""
# 如果不启用去除空白,直接复制文件
if not enable_trim:
try:
subprocess.run(["ffmpeg", "-i", input_filepath, "-c", "copy", output_filepath], check=True, capture_output=True)
print(f"Silence trimming disabled. Copied {input_filepath} to {output_filepath}")
return
except subprocess.CalledProcessError as e:
raise RuntimeError(f"Error copying audio file: {e}")
try:
# Check if ffmpeg is available
subprocess.run(["ffmpeg", "-version"], check=True, capture_output=True)
@@ -307,16 +317,21 @@ def trim_audio_silence(input_filepath: str, output_filepath: str, silence_thresh
start_trim_val = 0.0 # Initialize start_trim_val
end_trim_val = current_audio_duration # Initialize end_trim_val with the full duration
# 保留首尾各200ms的空白
padding_ms = 0.2 # 200ms = 0.2秒
if silence_starts and silence_ends:
# Determine leading silence
if silence_starts[0] == 0.0: # Silence at the very beginning
start_trim_val = silence_ends[0]
# 从静音结束处往前保留200ms
start_trim_val = max(0.0, silence_ends[0] - padding_ms)
# Determine trailing silence
# Only consider trimming from the end if there's silence close to the end
if silence_ends[-1] >= (end_trim_val - min_silence_duration):
end_trim_val = silence_starts[-1]
# 从静音开始处往后保留200ms
end_trim_val = min(current_audio_duration, silence_starts[-1] + padding_ms)
# If after trimming, the duration becomes too short or negative, skip trimming
if (end_trim_val - start_trim_val) <= 0.01: # Add a small epsilon to avoid issues with very short audios
@@ -421,9 +436,13 @@ def _prepare_openai_settings(args, config_data):
def _read_prompt_files():
"""Reads content from input, overview, and podcast script prompt files."""
input_prompt = read_file_content('input.txt')
overview_prompt = read_file_content('prompt/prompt-overview.txt')
original_podscript_prompt = read_file_content('prompt/prompt-podscript.txt')
return input_prompt, overview_prompt, original_podscript_prompt
story_overview_prompt = read_file_content('prompt/prompt-story-overview.txt')
story_podscript_prompt = read_file_content('prompt/prompt-story-podscript.txt')
return input_prompt, overview_prompt, original_podscript_prompt, story_overview_prompt, story_podscript_prompt
def _extract_custom_content(input_prompt_content):
"""Extracts custom content from the input prompt."""
@@ -684,8 +703,17 @@ def generate_audio_for_item(item, config_data, tts_adapter, max_retries: int = 3
except Exception as e: # Catch other unexpected errors
raise RuntimeError(f"An unexpected error occurred for speaker {speaker_id} ({voice_code}) on attempt {attempt + 1}: {e}")
def _generate_all_audio_files(podcast_script, config_data, tts_adapter, threads):
"""Orchestrates the generation of individual audio files."""
def _generate_all_audio_files(podcast_script, config_data, tts_adapter, threads, enable_trim_silence: bool = True):
"""
Orchestrates the generation of individual audio files.
Args:
podcast_script: The podcast script containing transcripts.
config_data: Configuration data.
tts_adapter: TTS adapter for audio generation.
threads: Number of threads for parallel processing.
enable_trim_silence: Whether to enable silence trimming for audio files. Default is True.
"""
os.makedirs(output_dir, exist_ok=True)
print("\nGenerating audio files...")
# test script
@@ -712,7 +740,7 @@ def _generate_all_audio_files(podcast_script, config_data, tts_adapter, threads)
if original_audio_file:
# Define a path for the trimmed audio file
trimmed_audio_file = os.path.join(output_dir, f"trimmed_{os.path.basename(original_audio_file)}")
trim_audio_silence(original_audio_file, trimmed_audio_file)
trim_audio_silence(original_audio_file, trimmed_audio_file, enable_trim=enable_trim_silence)
# Use the trimmed file for the final merge
audio_files_dict[index] = trimmed_audio_file
# Clean up the original untrimmed file
@@ -873,7 +901,7 @@ def generate_podcast_audio():
config_data = _load_configuration()
api_key, base_url, model = _prepare_openai_settings(args, config_data)
input_prompt_content, overview_prompt, original_podscript_prompt = _read_prompt_files()
input_prompt_content, overview_prompt, original_podscript_prompt, story_overview_prompt, story_podscript_prompt = _read_prompt_files()
custom_content, input_prompt = _extract_custom_content(input_prompt_content)
podscript_prompt, pod_users, voices, turn_pattern = _prepare_podcast_prompts(config_data, original_podscript_prompt, custom_content, args.usetime, args.output_language)
@@ -886,7 +914,7 @@ def generate_podcast_audio():
tts_adapter = _initialize_tts_adapter(config_data) # 初始化 TTS 适配器,现在返回适配器映射
audio_files = _generate_all_audio_files(podcast_script, config_data, tts_adapter, args.threads)
audio_files = _generate_all_audio_files(podcast_script, config_data, tts_adapter, args.threads, enable_trim_silence=True)
file_list_path_created = _create_ffmpeg_file_list(audio_files, len(podcast_script.get("podcast_transcripts", [])))
output_audio_filepath = merge_audio_files(file_list_path_created)
return {
@@ -919,7 +947,7 @@ def generate_podcast_audio_api(args, config_path: str, input_txt_content: str, t
config_data["podUsers"] = podUsers
final_api_key, final_base_url, final_model = _prepare_openai_settings(args, config_data)
input_prompt, overview_prompt, original_podscript_prompt = _read_prompt_files()
input_prompt, overview_prompt, original_podscript_prompt, story_overview_prompt, story_podscript_prompt = _read_prompt_files()
custom_content, input_prompt = _extract_custom_content(input_txt_content)
# Assuming `output_language` is passed directly to the function
podscript_prompt, pod_users, voices, turn_pattern = _prepare_podcast_prompts(config_data, original_podscript_prompt, custom_content, args.usetime, args.output_language)
@@ -933,7 +961,7 @@ def generate_podcast_audio_api(args, config_path: str, input_txt_content: str, t
tts_adapter = _initialize_tts_adapter(config_data, tts_providers_config_content) # 初始化 TTS 适配器,现在返回适配器映射
audio_files = _generate_all_audio_files(podcast_script, config_data, tts_adapter, args.threads)
audio_files = _generate_all_audio_files(podcast_script, config_data, tts_adapter, args.threads, enable_trim_silence=True)
file_list_path_created = _create_ffmpeg_file_list(audio_files, len(podcast_script.get("podcast_transcripts", [])))
output_audio_filepath = merge_audio_files(file_list_path_created)
@@ -969,4 +997,62 @@ if __name__ == "__main__":
finally:
end_time = time.time()
execution_time = end_time - start_time
print(f"\nTotal execution time: {execution_time:.2f} seconds")
print(f"\nTotal execution time: {execution_time:.2f} seconds")
def generate_podcast_with_story_api(args, config_path: str, input_txt_content: str, tts_providers_config_content: str, podUsers_json_content: str) -> dict:
    """
    Generate a podcast audio file in "immersive story" mode.

    Works like generate_podcast_audio_api but uses the story-specific
    prompt templates (prompt/prompt-story-overview.txt and
    prompt/prompt-story-podscript.txt) for overview and script generation.

    Args:
        args: Parsed arguments; must provide ``usetime``, ``output_language``
            and ``threads``, plus any OpenAI overrides consumed by
            _prepare_openai_settings.
        config_path (str): Path to the configuration JSON file.
        input_txt_content (str): Content of the input prompt.
        tts_providers_config_content (str): JSON content describing the TTS
            provider configurations.
        podUsers_json_content (str): JSON string describing the podcast
            speakers/voices.

    Returns:
        dict: Task results with keys ``output_audio_filepath``,
            ``podcast_script``, ``podUsers``, ``audio_duration`` (formatted
            "MM:SS"), ``title`` and ``tags``.
    """
    print("Starting podcast audio generation...")
    podUsers = json.loads(podUsers_json_content)
    config_data = _load_configuration_path(config_path, podUsers)
    config_data["podUsers"] = podUsers
    final_api_key, final_base_url, final_model = _prepare_openai_settings(args, config_data)
    # Only the story-specific prompts are used in this mode; the regular
    # overview/podscript prompts returned by _read_prompt_files are ignored.
    input_prompt, overview_prompt, original_podscript_prompt, story_overview_prompt, story_podscript_prompt = _read_prompt_files()
    custom_content, input_prompt = _extract_custom_content(input_txt_content)
    # `output_language` is taken from args rather than a separate parameter.
    podscript_prompt, pod_users, voices, turn_pattern = _prepare_podcast_prompts(config_data, story_podscript_prompt, custom_content, args.usetime, args.output_language)
    print(f"\nInput Prompt (from provided content):\n{input_prompt[:100]}...")
    print(f"\nOverview Prompt (prompt-story-overview.txt):\n{story_overview_prompt[:100]}...")
    print(f"\nPodscript Prompt (prompt-story-podscript.txt):\n{podscript_prompt[:1000]}...")
    overview_content, title, tags = _generate_overview_content(final_api_key, final_base_url, final_model, story_overview_prompt, input_prompt, args.output_language)
    podcast_script = _generate_podcast_script(final_api_key, final_base_url, final_model, podscript_prompt, input_prompt)
    # Initialize the TTS adapter(s); returns a mapping of adapters.
    tts_adapter = _initialize_tts_adapter(config_data, tts_providers_config_content)
    audio_files = _generate_all_audio_files(podcast_script, config_data, tts_adapter, args.threads, enable_trim_silence=True)
    file_list_path_created = _create_ffmpeg_file_list(audio_files, len(podcast_script.get("podcast_transcripts", [])))
    output_audio_filepath = merge_audio_files(file_list_path_created)
    audio_duration_seconds = get_audio_duration(os.path.join(output_dir, output_audio_filepath))
    # Fall back to "00:00" when the duration cannot be determined.
    formatted_duration = "00:00"
    if audio_duration_seconds is not None:
        minutes = int(audio_duration_seconds // 60)
        seconds = int(audio_duration_seconds % 60)
        formatted_duration = f"{minutes:02}:{seconds:02}"
    task_results = {
        "output_audio_filepath": output_audio_filepath,
        "podcast_script": podcast_script,
        "podUsers": podUsers,
        "audio_duration": formatted_duration,
        "title": title,
        "tags": tags,
    }
    return task_results