feat(podcast): add immersive story mode with multi-language podcast generation

Add an immersive story generation mode with verbatim narration and intelligent segmentation:
- Server: add the generate_podcast_with_story_api function and a dedicated API endpoint
- Add story-mode prompt templates (prompt-story-overview.txt and prompt-story-podscript.txt)
- Frontend: add a mode-toggle UI supporting the two modes, AI Podcast and Immersive Story
- Immersive story mode costs a fixed 30 points and takes no language or duration parameters
- Improve the audio silence-trimming logic, keeping 200 ms of padding at the head and tail for more natural playback
- Fix session management and error handling to improve system stability
- Add multi-language strings (Chinese, English, Japanese) for the mode-toggle copy
This commit is contained in:
hex2077
2025-10-19 22:09:13 +08:00
parent 321e3cded4
commit dd2a1b536f
18 changed files with 672 additions and 116 deletions

View File

@@ -22,7 +22,7 @@ import httpx # import the httpx library
from io import BytesIO # import BytesIO
import base64 # import base64
from podcast_generator import generate_podcast_audio_api
from podcast_generator import generate_podcast_audio_api, generate_podcast_with_story_api
class TaskStatus(str, Enum):
PENDING = "pending"
@@ -214,6 +214,7 @@ async def _generate_podcast_task(
output_language: Optional[str] = None,
usetime: Optional[str] = None,
lang: Optional[str] = None,
use_story_mode: bool = False, # new parameter: whether to use story mode
):
task_results[auth_id][task_id]["status"] = TaskStatus.RUNNING
try:
@@ -230,14 +231,25 @@ async def _generate_podcast_task(
if not actual_config_path:
raise ValueError(f"Invalid tts_provider: {tts_provider}.")
podcast_generation_results = await asyncio.to_thread(
generate_podcast_audio_api,
args=args,
config_path=actual_config_path,
input_txt_content=input_txt_content.strip(),
tts_providers_config_content=tts_providers_config_content.strip(),
podUsers_json_content=podUsers_json_content.strip()
)
# Decide which generation function to call based on use_story_mode
if use_story_mode:
podcast_generation_results = await asyncio.to_thread(
generate_podcast_with_story_api,
args=args,
config_path=actual_config_path,
input_txt_content=input_txt_content.strip(),
tts_providers_config_content=tts_providers_config_content.strip(),
podUsers_json_content=podUsers_json_content.strip()
)
else:
podcast_generation_results = await asyncio.to_thread(
generate_podcast_audio_api,
args=args,
config_path=actual_config_path,
input_txt_content=input_txt_content.strip(),
tts_providers_config_content=tts_providers_config_content.strip(),
podUsers_json_content=podUsers_json_content.strip()
)
task_results[auth_id][task_id]["status"] = TaskStatus.COMPLETED
task_results[auth_id][task_id].update(podcast_generation_results)
print(f"\nPodcast generation completed for task {task_id}. Output file: {podcast_generation_results.get('output_audio_filepath')}")
@@ -266,10 +278,11 @@ async def _generate_podcast_task(
"task_id": str(task_id),
"auth_id": auth_id,
"task_results": task_results[auth_id][task_id],
"timestamp": int(time.time()),
"timestamp": int(time.time()),
"status": task_results[auth_id][task_id]["status"],
"usetime": usetime,
"lang": lang,
"mode": "ai-story" if use_story_mode else "normal",
}
MAX_RETRIES = 3 # maximum number of callback retries
@@ -293,7 +306,34 @@ async def _generate_podcast_task(
print(f"Retrying callback for task {task_id} in {RETRY_DELAY} seconds...")
await asyncio.sleep(RETRY_DELAY)
else:
print(f"Callback failed for task {task_id} after {MAX_RETRIES} attempts.")
print(f"Callback failed for task {task_id} after {MAX_RETRIES} attempts.")
async def _generate_podcast_with_story_task(
task_id: UUID,
auth_id: str,
api_key: str,
base_url: str,
model: str,
input_txt_content: str,
tts_providers_config_content: str,
podUsers_json_content: str,
threads: int,
tts_provider: str,
callback_url: Optional[str] = None, # callback URL parameter
output_language: Optional[str] = None,
usetime: Optional[str] = None,
lang: Optional[str] = None,
):
"""
调用带优化流程的播客生成任务处理函数
"""
return await _generate_podcast_task(
task_id, auth_id, api_key, base_url, model, input_txt_content,
tts_providers_config_content, podUsers_json_content, threads,
tts_provider, callback_url, output_language, usetime, lang,
use_story_mode=True
)
# @app.post("/generate-podcast", dependencies=[Depends(verify_signature)])
@app.post("/generate-podcast")
@@ -354,6 +394,65 @@ async def generate_podcast_submission(
return {"message": "Podcast generation started.", "task_id": task_id}
@app.post("/generate-podcast-with-story")
async def generate_podcast_with_story_submission(
background_tasks: BackgroundTasks,
auth_id: str = Depends(get_auth_id),
api_key: str = Form("OpenAI API key."),
base_url: str = Form("https://api.openai.com/v1"),
model: str = Form("gpt-3.5-turbo"),
input_txt_content: str = Form(...),
tts_providers_config_content: str = Form(...),
podUsers_json_content: str = Form(...),
threads: int = Form(1),
tts_provider: str = Form("index-tts"),
callback_url: Optional[str] = Form(None),
output_language: Optional[str] = Form(None),
usetime: Optional[str] = Form(None),
lang: Optional[str] = Form(None),
):
# 1. Validate tts_provider
if tts_provider not in tts_provider_map:
raise HTTPException(status_code=400, detail=f"Invalid tts_provider: {tts_provider}.")
# 2. Check whether this auth_id already has a running task
if auth_id in task_results:
for existing_task_id, existing_task_info in task_results[auth_id].items():
if existing_task_info["status"] in (TaskStatus.RUNNING, TaskStatus.PENDING):
raise HTTPException(status_code=409, detail=f"There is already a running task (ID: {existing_task_id}) for this auth_id. Please wait for it to complete.")
task_id = uuid.uuid4()
if auth_id not in task_results:
task_results[auth_id] = {}
task_results[auth_id][task_id] = {
"status": TaskStatus.PENDING,
"result": None,
"timestamp": time.time(),
"callback_url": callback_url, # 存储回调地址
"auth_id": auth_id, # 存储 auth_id
}
background_tasks.add_task(
_generate_podcast_with_story_task,
task_id,
auth_id,
api_key,
base_url,
model,
input_txt_content,
tts_providers_config_content,
podUsers_json_content,
threads,
tts_provider,
callback_url,
output_language,
usetime,
lang,
)
return {"message": "Podcast generation with story started.", "task_id": task_id}
# @app.get("/podcast-status", dependencies=[Depends(verify_signature)])
@app.get("/podcast-status")
async def get_podcast_status(

View File

@@ -19,7 +19,7 @@ from tts_adapters import TTSAdapter, IndexTTSAdapter, EdgeTTSAdapter, FishAudioA
# Global configuration
output_dir = "output"
# file_list_path is now generated uniquely for each merge operation
tts_providers_config_path = '../config/tts_providers.json'
tts_providers_config_path = '../config/tts_providers-local.json'
# Global cache for TTS provider configurations
tts_provider_configs_cache = {}
@@ -250,7 +250,7 @@ def get_audio_duration(filepath: str) -> Optional[float]:
print(f"An unexpected error occurred while getting audio duration for {filepath}: {e}")
return None
def trim_audio_silence(input_filepath: str, output_filepath: str, silence_threshold_db: float = -60, min_silence_duration: float = 0.5):
def trim_audio_silence(input_filepath: str, output_filepath: str, silence_threshold_db: float = -60, min_silence_duration: float = 0.5, enable_trim: bool = True):
"""
Removes leading and trailing silence from an audio file using ffmpeg.
@@ -259,7 +259,17 @@ def trim_audio_silence(input_filepath: str, output_filepath: str, silence_thresh
output_filepath (str): Path where the trimmed audio file will be saved.
silence_threshold_db (float): Silence threshold in dB. Audio below this level is considered silence.
min_silence_duration (float): Minimum duration of silence to detect, in seconds.
enable_trim (bool): Whether to enable silence trimming. If False, just copy the file.
"""
# If trimming is disabled, copy the file through unchanged
if not enable_trim:
try:
subprocess.run(["ffmpeg", "-i", input_filepath, "-c", "copy", output_filepath], check=True, capture_output=True)
print(f"Silence trimming disabled. Copied {input_filepath} to {output_filepath}")
return
except subprocess.CalledProcessError as e:
raise RuntimeError(f"Error copying audio file: {e}")
try:
# Check if ffmpeg is available
subprocess.run(["ffmpeg", "-version"], check=True, capture_output=True)
@@ -307,16 +317,21 @@ def trim_audio_silence(input_filepath: str, output_filepath: str, silence_thresh
start_trim_val = 0.0 # Initialize start_trim_val
end_trim_val = current_audio_duration # Initialize end_trim_val with the full duration
# Keep 200 ms of padding at the head and tail
padding_ms = 0.2 # 200 ms, expressed in seconds
if silence_starts and silence_ends:
# Determine leading silence
if silence_starts[0] == 0.0: # Silence at the very beginning
start_trim_val = silence_ends[0]
# keep 200 ms of padding before the end of the leading silence
start_trim_val = max(0.0, silence_ends[0] - padding_ms)
# Determine trailing silence
# Only consider trimming from the end if there's silence close to the end
if silence_ends[-1] >= (end_trim_val - min_silence_duration):
end_trim_val = silence_starts[-1]
# keep 200 ms of padding after the start of the trailing silence
end_trim_val = min(current_audio_duration, silence_starts[-1] + padding_ms)
# If after trimming, the duration becomes too short or negative, skip trimming
if (end_trim_val - start_trim_val) <= 0.01: # Add a small epsilon to avoid issues with very short audios
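
To make the padding rule above concrete, a minimal sketch with made-up silencedetect values (assuming ffmpeg reported leading silence from 0.0-1.0 s and trailing silence from 9.5-10.0 s on a 10-second clip):

```python
padding_ms = 0.2  # 200 ms, expressed in seconds
silence_starts, silence_ends = [0.0, 9.5], [1.0, 10.0]
current_audio_duration = 10.0

# Cut at the end of the leading silence, backed off by 200 ms of padding.
start_trim_val = max(0.0, silence_ends[0] - padding_ms)                      # 0.8
# Cut at the start of the trailing silence, extended by 200 ms of padding.
end_trim_val = min(current_audio_duration, silence_starts[-1] + padding_ms)  # 9.7
print(start_trim_val, end_trim_val)  # kept span: 0.8 s .. 9.7 s
```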
@@ -421,9 +436,13 @@ def _prepare_openai_settings(args, config_data):
def _read_prompt_files():
"""Reads content from input, overview, and podcast script prompt files."""
input_prompt = read_file_content('input.txt')
overview_prompt = read_file_content('prompt/prompt-overview.txt')
original_podscript_prompt = read_file_content('prompt/prompt-podscript.txt')
return input_prompt, overview_prompt, original_podscript_prompt
story_overview_prompt = read_file_content('prompt/prompt-story-overview.txt')
story_podscript_prompt = read_file_content('prompt/prompt-story-podscript.txt')
return input_prompt, overview_prompt, original_podscript_prompt, story_overview_prompt, story_podscript_prompt
def _extract_custom_content(input_prompt_content):
"""Extracts custom content from the input prompt."""
@@ -684,8 +703,17 @@ def generate_audio_for_item(item, config_data, tts_adapter, max_retries: int = 3
except Exception as e: # Catch other unexpected errors
raise RuntimeError(f"An unexpected error occurred for speaker {speaker_id} ({voice_code}) on attempt {attempt + 1}: {e}")
def _generate_all_audio_files(podcast_script, config_data, tts_adapter, threads):
"""Orchestrates the generation of individual audio files."""
def _generate_all_audio_files(podcast_script, config_data, tts_adapter, threads, enable_trim_silence: bool = True):
"""
Orchestrates the generation of individual audio files.
Args:
podcast_script: The podcast script containing transcripts.
config_data: Configuration data.
tts_adapter: TTS adapter for audio generation.
threads: Number of threads for parallel processing.
enable_trim_silence: Whether to enable silence trimming for audio files. Default is True.
"""
os.makedirs(output_dir, exist_ok=True)
print("\nGenerating audio files...")
# test script
@@ -712,7 +740,7 @@ def _generate_all_audio_files(podcast_script, config_data, tts_adapter, threads)
if original_audio_file:
# Define a path for the trimmed audio file
trimmed_audio_file = os.path.join(output_dir, f"trimmed_{os.path.basename(original_audio_file)}")
trim_audio_silence(original_audio_file, trimmed_audio_file)
trim_audio_silence(original_audio_file, trimmed_audio_file, enable_trim=enable_trim_silence)
# Use the trimmed file for the final merge
audio_files_dict[index] = trimmed_audio_file
# Clean up the original untrimmed file
@@ -873,7 +901,7 @@ def generate_podcast_audio():
config_data = _load_configuration()
api_key, base_url, model = _prepare_openai_settings(args, config_data)
input_prompt_content, overview_prompt, original_podscript_prompt = _read_prompt_files()
input_prompt_content, overview_prompt, original_podscript_prompt, story_overview_prompt, story_podscript_prompt = _read_prompt_files()
custom_content, input_prompt = _extract_custom_content(input_prompt_content)
podscript_prompt, pod_users, voices, turn_pattern = _prepare_podcast_prompts(config_data, original_podscript_prompt, custom_content, args.usetime, args.output_language)
@@ -886,7 +914,7 @@ def generate_podcast_audio():
tts_adapter = _initialize_tts_adapter(config_data) # initialize the TTS adapter; it now returns an adapter map
audio_files = _generate_all_audio_files(podcast_script, config_data, tts_adapter, args.threads)
audio_files = _generate_all_audio_files(podcast_script, config_data, tts_adapter, args.threads, enable_trim_silence=True)
file_list_path_created = _create_ffmpeg_file_list(audio_files, len(podcast_script.get("podcast_transcripts", [])))
output_audio_filepath = merge_audio_files(file_list_path_created)
return {
@@ -919,7 +947,7 @@ def generate_podcast_audio_api(args, config_path: str, input_txt_content: str, t
config_data["podUsers"] = podUsers
final_api_key, final_base_url, final_model = _prepare_openai_settings(args, config_data)
input_prompt, overview_prompt, original_podscript_prompt = _read_prompt_files()
input_prompt, overview_prompt, original_podscript_prompt, story_overview_prompt, story_podscript_prompt = _read_prompt_files()
custom_content, input_prompt = _extract_custom_content(input_txt_content)
# Assuming `output_language` is passed directly to the function
podscript_prompt, pod_users, voices, turn_pattern = _prepare_podcast_prompts(config_data, original_podscript_prompt, custom_content, args.usetime, args.output_language)
@@ -933,7 +961,7 @@ def generate_podcast_audio_api(args, config_path: str, input_txt_content: str, t
tts_adapter = _initialize_tts_adapter(config_data, tts_providers_config_content) # initialize the TTS adapter; it now returns an adapter map
audio_files = _generate_all_audio_files(podcast_script, config_data, tts_adapter, args.threads)
audio_files = _generate_all_audio_files(podcast_script, config_data, tts_adapter, args.threads, enable_trim_silence=True)
file_list_path_created = _create_ffmpeg_file_list(audio_files, len(podcast_script.get("podcast_transcripts", [])))
output_audio_filepath = merge_audio_files(file_list_path_created)
@@ -969,4 +997,62 @@ if __name__ == "__main__":
finally:
end_time = time.time()
execution_time = end_time - start_time
print(f"\nTotal execution time: {execution_time:.2f} seconds")
print(f"\nTotal execution time: {execution_time:.2f} seconds")
def generate_podcast_with_story_api(args, config_path: str, input_txt_content: str, tts_providers_config_content: str, podUsers_json_content: str) -> dict:
"""
Generates a podcast audio file based on the provided parameters.
Args:
api_key (str): OpenAI API key.
base_url (str): OpenAI API base URL.
model (str): OpenAI model to use.
threads (int): Number of threads for audio generation.
config_path (str): Path to the configuration JSON file.
input_txt_content (str): Content of the input prompt.
output_language (str): Language for the podcast overview and script (default: Chinese).
Returns:
str: The path to the generated audio file.
"""
print("Starting podcast audio generation...")
podUsers = json.loads(podUsers_json_content)
config_data = _load_configuration_path(config_path, podUsers)
config_data["podUsers"] = podUsers
final_api_key, final_base_url, final_model = _prepare_openai_settings(args, config_data)
input_prompt, overview_prompt, original_podscript_prompt, story_overview_prompt, story_podscript_prompt = _read_prompt_files()
custom_content, input_prompt = _extract_custom_content(input_txt_content)
# Assuming `output_language` is passed directly to the function
podscript_prompt, pod_users, voices, turn_pattern = _prepare_podcast_prompts(config_data, story_podscript_prompt, custom_content, args.usetime, args.output_language)
print(f"\nInput Prompt (from provided content):\n{input_prompt[:100]}...")
print(f"\nOverview Prompt (prompt-overview.txt):\n{story_overview_prompt[:100]}...")
print(f"\nPodscript Prompt (prompt-podscript.txt):\n{podscript_prompt[:1000]}...")
overview_content, title, tags = _generate_overview_content(final_api_key, final_base_url, final_model, story_overview_prompt, input_prompt, args.output_language)
podcast_script = _generate_podcast_script(final_api_key, final_base_url, final_model, podscript_prompt, input_prompt)
tts_adapter = _initialize_tts_adapter(config_data, tts_providers_config_content) # initialize the TTS adapter; it now returns an adapter map
audio_files = _generate_all_audio_files(podcast_script, config_data, tts_adapter, args.threads, enable_trim_silence=True)
file_list_path_created = _create_ffmpeg_file_list(audio_files, len(podcast_script.get("podcast_transcripts", [])))
output_audio_filepath = merge_audio_files(file_list_path_created)
audio_duration_seconds = get_audio_duration(os.path.join(output_dir, output_audio_filepath))
formatted_duration = "00:00"
if audio_duration_seconds is not None:
minutes = int(audio_duration_seconds // 60)
seconds = int(audio_duration_seconds % 60)
formatted_duration = f"{minutes:02}:{seconds:02}"
task_results = {
"output_audio_filepath": output_audio_filepath,
"podcast_script": podcast_script,
"podUsers": podUsers,
"audio_duration": formatted_duration,
"title": title,
"tags": tags,
}
return task_results

View File

@@ -76,10 +76,6 @@ You are a master podcast scriptwriter, adept at transforming diverse input conte
* **Debate & Contrasting Views:** Use the host personas to create discussions from different perspectives, compelling other hosts to provide more detailed defenses and explanations.
* **Restatement & Summary:** The host (`speaker_0`) should provide restatements and summaries during pauses in the discussion and at the end of topics.
8. **Copy & Replacement:**
If a hyphen connects English letters and numbers or letters on both sides, replace it with a space.
Replace four-digit Arabic numerals with their Chinese character equivalents, one-to-one.
</guidelines>
<examples>
@@ -90,7 +86,7 @@ You are a master podcast scriptwriter, adept at transforming diverse input conte
<turn_pattern>random</turn_pattern>
</podcast_settings>
<source_content>
Quantum computing uses quantum bits or qubits which can exist in multiple states simultaneously due to superposition. This is different from classical bits (0 or 1). Think of it like a spinning coin. This allows for massive parallel computation.
{{input_content}}
</source_content>
</input>
<output_format>
@@ -139,6 +135,8 @@ You are a master podcast scriptwriter, adept at transforming diverse input conte
]
}}
</output_format>
</examples>
<final>
Transform the source material into a lively and engaging podcast conversation based on the provided settings. Craft dialogue that showcases authentic group chemistry and natural interaction. Use varied speech patterns reflecting real human conversation, ensuring the final script effectively educates and entertains the listener.
The final output is a JSON string without code blocks.

View File

@@ -0,0 +1,21 @@
**1. Metadata Generation**
* **Step 1: Intermediate Core Summary Generation (Internal Step)**
* **Task**: First, generate a core idea summary of approximately 150 characters based *only* on the **[body content]** of the document (ignoring titles and subtitles).
* **Position**: As the **fourth line** of the final output.
* **Step 2: Title Generation**
* **Source**: Must be refined from the "core summary" generated in the previous step.
* **Length**: Strictly controlled to be between 15-20 characters.
* **Format**: Adopt a "Main Title: Subtitle" structure, using a full-width colon ":" for separation. For example: "Brevity and Precision: Practical Engineering for AI Context".
* **Position**: As the **first line** of the final output.
* **Step 3: Tag Generation**
* **Source**: Extract from the **[body content]** of the document (ignoring titles and subtitles).
* **Quantity**: 3 to 5.
* **Format**: Keywords separated by the "#" symbol (e.g., #Keyword1#Keyword2).
* **Position**: As the **second line** of the final output.
**2. Output Language**
* **Make sure the language of the output content is the original input language.**
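
Taken together, the first four lines of the output would look like this (illustrative only; the tags and summary are invented, and the blank third line is an assumption the spec does not state):

```
Brevity and Precision: Practical Engineering for AI Context
#ContextEngineering#PromptDesign#AIWriting

A core summary of roughly 150 characters, distilled from the body content alone while ignoring titles and subtitles...
```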

View File

@@ -0,0 +1,119 @@
* **Output Format:** No explanatory text! The final output is a JSON string without code blocks. Make sure the language of the output content is the same as the source content.
* **End Format:** Do not add any summary or concluding remarks. The output must be only the JSON object.
<podcast_generation_system>
You are an intelligent text-processing system. Your task is to take the input content, segment it into complete sentences, assign speaker IDs according to the rules, and output the result as a raw JSON string, preserving the original text.
<input>
<!-- Podcast settings provide high-level configuration for the script generation. -->
<podcast_settings>
<!-- Define the total number of speakers. Minimum 1. Every speaker must be assigned at least one statement. -->
<num_speakers>{{numSpeakers}}</num_speakers>
</podcast_settings>
<!-- The source_content contains the text to be processed. -->
<source_content>
{{input_content}}
</source_content>
</input>
<guidelines>
1. **Primary Goal & Output Format:**
* Your only task is to convert the `<source_content>` into a JSON string.
* The output must be a single JSON object with one key: `"podcast_transcripts"`.
* The value of `"podcast_transcripts"` must be an array of objects, where each object has two keys: `"speaker_id"` (an integer) and `"dialog"` (a string).
* **Strictly output only the JSON string.** Do not include any explanations, comments, or code block formatting (like ```json).
2. **Text Segmentation:**
* Analyze the `<source_content>` and break it down into logical, complete sentences or statements.
* Segmentation should occur at natural punctuation marks (e.g., periods, question marks, exclamation points) or logical breaks in the flow of a single speaker's thought.
* **Crucially, you must not alter, summarize, or rewrite the original text.** The content of the `"dialog"` field must be an exact segment from the source.
* The output language must be identical to the input language.
3. **Speaker ID Assignment Logic (Roles):**
* **If Source Content Contains Speaker Roles:** If the `source_content` explicitly identifies speakers (e.g., "主持人:", "嘉宾A:", "Speaker 1:", "角色A"), you must map these roles to unique, consistent `speaker_id` integers (starting from 0). For example, "主持人" is always `speaker_id: 0`, "嘉宾A" is always `speaker_id: 1`, etc. Remove the role identifier (e.g., "主持人:") from the beginning of the `"dialog"` string.
* **If Source Content Has No Roles:** Proceed to Guideline 4 for automatic assignment.
4. **Speaker Assignment & Distribution Logic (Automatic):**
* **Rule 1 (Highest Priority): Logical Grouping.** This is the most important rule. Analyze the flow of the `<source_content>`. If multiple consecutive sentences form a single coherent thought, argument, or detailed explanation, they **must be assigned to the same `speaker_id`**. This is to ensure that a single speaker can fully develop a point before another speaker takes over. It is perfectly acceptable and encouraged for one speaker to have several consecutive dialogue blocks.
* **Rule 2: Speaker Variation.** After applying the logical grouping rule, distribute the resulting sentences or logical blocks among the different speakers to create a varied conversation. Switch speakers at logical transition points in the text, where the topic or perspective shifts.
* **Rule 3: Mandatory Speaker Inclusion.** You **must** ensure that every speaker, from `speaker_id: 0` to `speaker_id: num_speakers - 1`, is assigned at least one line of dialogue. Before finalizing the output, verify that all speakers have participated.
5. **Content Integrity:**
* The entire `<source_content>` must be processed and included in the final JSON output. No part of the original text should be omitted.
* The sum of all `"dialog"` strings in the output should reconstruct the original `<source_content>` (excluding any speaker role prefixes).
</guidelines>
<examples>
<!-- Example 1: Input with no speaker roles, demonstrating logical grouping -->
<input>
<podcast_settings>
<num_speakers>2</num_speakers>
</podcast_settings>
<source_content>
人工智能的发展进入了一个新阶段。其核心驱动力是大型语言模型的突破。这些模型能够理解和生成极其自然的文本,应用前景广阔。然而,我们也必须关注其伦理风险和潜在的滥用问题。
</source_content>
</input>
<output_format>
{{
"podcast_transcripts": [
{{
"speaker_id": 0,
"dialog": "人工智能的发展进入了一个新阶段。"
}},
{{
"speaker_id": 0,
"dialog": "其核心驱动力是大型语言模型的突破。"
}},
{{
"speaker_id": 0,
"dialog": "这些模型能够理解和生成极其自然的文本,应用前景广阔。"
}},
{{
"speaker_id": 1,
"dialog": "然而,我们也必须关注其伦理风险和潜在的滥用问题。"
}}
]
}}
</output_format>
<!-- Example 2: Input with explicit speaker roles -->
<input>
<podcast_settings>
<num_speakers>2</num_speakers>
</podcast_settings>
<source_content>
主持人: 大家好,欢迎收听。今天我们来聊聊人工智能。
嘉宾: 是的,主持人。人工智能最近发展很快,特别是在大模型领域。
</source_content>
</input>
<output_format>
{{
"podcast_transcripts": [
{{
"speaker_id": 0,
"dialog": "大家好,欢迎收听。"
}},
{{
"speaker_id": 0,
"dialog": "今天我们来聊聊人工智能。"
}},
{{
"speaker_id": 1,
"dialog": "是的,主持人。"
}},
{{
"speaker_id": 1,
"dialog": "人工智能最近发展很快,特别是在大模型领域。"
}}
]
}}
</output_format>
</examples>
<final>
Adhering strictly to all guidelines, process the input `<source_content>` and generate only the final JSON string. The output must be perfectly formatted JSON and nothing else.
</final>
</podcast_generation_system>

View File

@@ -103,7 +103,9 @@
"english": "English",
"japanese": "Japanese",
"under5Minutes": "5 minutes or less",
"between8And15Minutes": "8-15 minutes"
"between8And15Minutes": "8-15 minutes",
"aiPodcast": "AI Podcast",
"immersiveStory": "Immersive Story"
},
"podcastTabs": {
"script": "Script",

View File

@@ -103,7 +103,9 @@
"english": "英語",
"japanese": "日本語",
"under5Minutes": "約5分",
"between8And15Minutes": "8〜15分"
"between8And15Minutes": "8〜15分",
"aiPodcast": "AIポッドキャスト",
"immersiveStory": "没入型ストーリー"
},
"podcastTabs": {
"script": "スクリプト",

View File

@@ -103,7 +103,9 @@
"english": "英文",
"japanese": "日文",
"under5Minutes": "5分钟左右",
"between8And15Minutes": "8-15分钟"
"between8And15Minutes": "8-15分钟",
"aiPodcast": "AI播客",
"immersiveStory": "沉浸故事"
},
"podcastTabs": {
"script": "脚本",

View File

@@ -169,8 +169,13 @@ export default function HomePage({ params }: { params: Promise<{ lang: string }>
try {
// info('开始生成播客', '正在处理您的请求...');
// Choose the API endpoint based on the selected mode
const apiEndpoint = request.mode === 'ai-story'
? '/api/generate-podcast-with-story'
: '/api/generate-podcast';
// Send the request body directly as JSON
const response = await fetch('/api/generate-podcast', {
const response = await fetch(apiEndpoint, {
method: 'POST',
headers: {
'Content-Type': 'application/json',

View File

@@ -0,0 +1,131 @@
import { NextRequest, NextResponse } from 'next/server';
import { startPodcastWithStoryGenerationTask } from '@/lib/podcastApi';
import type { PodcastGenerationRequest } from '@/types';
import { getSessionData } from '@/lib/server-actions';
import { getUserPoints } from '@/lib/points';
import { fetchAndCacheProvidersLocal } from '@/lib/config-local';
import { getTranslation } from '@/i18n';
import { getLanguageFromRequest } from '@/lib/utils';
const enableTTSConfigPage = process.env.NEXT_PUBLIC_ENABLE_TTS_CONFIG_PAGE === 'true';
export async function POST(request: NextRequest) {
const lang = getLanguageFromRequest(request);
const { t } = await getTranslation(lang, 'errors');
const session = await getSessionData();
const userId = session.user?.id;
if (!userId) {
return NextResponse.json(
{ success: false, error: t('user_not_logged_in_or_session_expired') },
{ status: 403 }
);
}
try {
const body: PodcastGenerationRequest = await request.json();
// Validate the parameters
if (!body.input_txt_content || body.input_txt_content.trim().length === 0) {
return NextResponse.json(
{ success: false, error: t('request_body_cannot_be_empty') },
{ status: 400 }
);
}
if (!body.tts_provider || body.tts_provider.trim().length === 0) {
return NextResponse.json(
{ success: false, error: t('tts_provider_cannot_be_empty') },
{ status: 400 }
);
}
let podUsers: any[] = [];
try {
podUsers = JSON.parse(body.podUsers_json_content || '[]');
if (podUsers.length === 0) {
return NextResponse.json(
{ success: false, error: t('please_select_at_least_one_speaker') },
{ status: 400 }
);
}
} catch (e) {
return NextResponse.json(
{ success: false, error: t('invalid_speaker_config_format') },
{ status: 400 }
);
}
// 1. Look up the user's points
const currentPoints = await getUserPoints(userId);
const POINTS_PER_STORY = 30; // immersive story mode costs a fixed 30 points
// 2. Check whether the balance is sufficient
if (currentPoints === null || currentPoints < POINTS_PER_STORY) {
return NextResponse.json(
{ success: false, error: t('insufficient_points_for_podcast', { pointsNeeded: POINTS_PER_STORY, currentPoints: currentPoints || 0 }) },
{ status: 402 }
);
}
// Immersive story mode does not validate the language and duration parameters
// Build the final request according to enableTTSConfigPage
let finalRequest: PodcastGenerationRequest;
if (enableTTSConfigPage) {
// If the config page is enabled, use the body passed in from the frontend directly
if (body.tts_providers_config_content === undefined || body.api_key === undefined || body.base_url === undefined || body.model === undefined) {
return NextResponse.json(
{ success: false, error: t('missing_frontend_tts_config') },
{ status: 400 }
);
}
finalRequest = body as PodcastGenerationRequest;
} else {
// If the config page is not enabled, fetch the TTS configuration on the backend
const settings = await fetchAndCacheProvidersLocal(lang);
if (!settings || !settings.apikey || !settings.model) {
return NextResponse.json(
{ success: false, error: t('incomplete_backend_tts_config') },
{ status: 500 }
);
}
finalRequest = {
input_txt_content: body.input_txt_content,
tts_provider: body.tts_provider,
podUsers_json_content: body.podUsers_json_content,
tts_providers_config_content: JSON.stringify(settings),
api_key: settings.apikey,
base_url: settings.baseurl,
model: settings.model,
} as PodcastGenerationRequest;
}
const callback_url = process.env.NEXT_PUBLIC_PODCAST_CALLBACK_URL || ""
finalRequest.callback_url = callback_url;
// Start the immersive story generation task
const result = await startPodcastWithStoryGenerationTask(finalRequest, userId, lang);
if (result.success) {
return NextResponse.json({
success: true,
data: result.data,
});
} else {
return NextResponse.json(
{ success: false, error: result.error },
{ status: result.statusCode || 400 }
);
}
} catch (error: any) {
console.error('Error in generate-podcast-with-story API:', error);
const statusCode = error.statusCode || 500;
return NextResponse.json(
{ success: false, error: error.message || t('internal_server_error_default') },
{ status: statusCode }
);
}
}

View File

@@ -58,11 +58,16 @@ export async function POST(request: NextRequest) {
// 1. Look up the user's points
const currentPoints = await getUserPoints(userId);
const POINTS_PER_PODCAST = parseInt(process.env.POINTS_PER_PODCAST || '10', 10); // from the environment; defaults to 10
// 2. Check whether the balance is sufficient
if (currentPoints === null || currentPoints < POINTS_PER_PODCAST) {
// 2. Calculate the points required based on the duration
let pointsToDeduct = parseInt(process.env.POINTS_PER_PODCAST || '10', 10); // from the environment; defaults to 10
if(body.usetime === '8-15 minutes') {
pointsToDeduct = pointsToDeduct * 2;
}
// 3. Check whether the balance is sufficient
if (currentPoints === null || currentPoints < pointsToDeduct) {
return NextResponse.json(
{ success: false, error: t('insufficient_points_for_podcast', { pointsNeeded: POINTS_PER_PODCAST, currentPoints: currentPoints || 0 }) },
{ success: false, error: t('insufficient_points_for_podcast', { pointsNeeded: pointsToDeduct, currentPoints: currentPoints || 0 }) },
{ status: 402 } // 402 Payment Required: insufficient points
);
}

View File

@@ -30,7 +30,7 @@ export async function GET(request: NextRequest) { // the GET handler receives the request
}
export async function PUT(request: NextRequest) {
const { task_id, auth_id, timestamp, status, usetime, lang } = await request.json();
const { task_id, auth_id, timestamp, status, usetime, mode, lang } = await request.json();
const { t } = await getTranslation(lang, 'errors'); // initialize translations
try {
if(status !== 'completed') {
@@ -62,9 +62,16 @@ export async function PUT(request: NextRequest) {
const userId = auth_id; // we assume auth_id is the userId here
// 5. Deduct points
let pointsToDeduct = parseInt(process.env.POINTS_PER_PODCAST || '10', 10); // from the environment; defaults to 10
if(usetime === '8-15 minutes') {
pointsToDeduct = pointsToDeduct * 2;
let pointsToDeduct: number;
if (mode === 'ai-story') {
// Immersive story mode costs a fixed 30 points
pointsToDeduct = 30;
} else {
// AI podcast mode calculates points based on the duration
pointsToDeduct = parseInt(process.env.POINTS_PER_PODCAST || '10', 10);
if(usetime === '8-15 minutes') {
pointsToDeduct = pointsToDeduct * 2;
}
}
const reasonCode = "podcast_generation";

View File

@@ -9,6 +9,7 @@ import {
AiOutlineGlobal,
AiOutlineDown,
AiOutlineLoading3Quarters,
AiOutlineStar
} from 'react-icons/ai';
import {
Wand2,
@@ -66,7 +67,7 @@ const PodcastCreator: React.FC<PodcastCreatorProps> = ({
const [topic, setTopic] = useState('');
const [customInstructions, setCustomInstructions] = useState('');
const [selectedMode, setSelectedMode] = useState<'ai-podcast' | 'flowspeech'>('ai-podcast');
const [selectedMode, setSelectedMode] = useState<'ai-podcast' | 'ai-story'>('ai-podcast');
// On init, load topic and customInstructions from localStorage
useEffect(() => {
@@ -144,7 +145,14 @@ const PodcastCreator: React.FC<PodcastCreatorProps> = ({
const fileInputRef = useRef<HTMLInputElement>(null);
const { toasts, error, success, removeToast } = useToast(); // useToast hook; also bring in success
const { data: session } = useSession(); // get the session
const { data: session, isPending, error: sessionError } = useSession(); // get the session along with its status
// Handle session errors
useEffect(() => {
if (sessionError) {
console.error('Session error:', sessionError);
}
}, [sessionError]);
const handleSubmit = async () => { // changed to an async function
if (!session?.user) { // check whether the user is logged in
@@ -171,16 +179,18 @@ const PodcastCreator: React.FC<PodcastCreatorProps> = ({
const handleConfirmGenerate = async () => {
let inputTxtContent = topic.trim();
if (customInstructions.trim()) {
// Only append custom instructions in AI podcast mode
if (selectedMode === 'ai-podcast' && customInstructions.trim()) {
inputTxtContent = "```custom-begin"+`\n${customInstructions.trim()}\n`+"```custom-end"+`\n${inputTxtContent}`;
}
const request: PodcastGenerationRequest = {
// Build different request parameters depending on the mode
const baseRequest = {
tts_provider: selectedConfigName.replace('.json', ''),
input_txt_content: inputTxtContent,
podUsers_json_content: JSON.stringify(selectedPodcastVoices[selectedConfigName] || []),
usetime: duration,
output_language: language,
mode: selectedMode, // add the mode flag
...(enableTTSConfigPage ? {
tts_providers_config_content: JSON.stringify(settings),
api_key: settings?.apikey,
@@ -189,6 +199,15 @@ const PodcastCreator: React.FC<PodcastCreatorProps> = ({
} : {})
};
// Only include the language and duration parameters in AI podcast mode
const request: PodcastGenerationRequest = selectedMode === 'ai-podcast'
? {
...baseRequest,
usetime: duration,
output_language: language,
}
: baseRequest;
try {
await onGenerate(request); // wait for the API call to complete
// Clear topic and customInstructions and update localStorage
@@ -312,7 +331,7 @@ const PodcastCreator: React.FC<PodcastCreatorProps> = ({
</h1>
{/* Mode toggle buttons todo */}
{/* <div className="flex items-center justify-center gap-2 sm:gap-4 mb-8 flex-wrap">
<div className="flex items-center justify-center gap-2 sm:gap-4 mb-8 flex-wrap">
<button
onClick={() => setSelectedMode('ai-podcast')}
className={cn(
@@ -323,21 +342,21 @@ const PodcastCreator: React.FC<PodcastCreatorProps> = ({
)}
>
<AiFillPlayCircle className="w-4 h-4" />
AI播客
{t('podcastCreator.aiPodcast')}
</button>
<button
onClick={() => setSelectedMode('flowspeech')}
onClick={() => setSelectedMode('ai-story')}
className={cn(
"flex items-center gap-2 px-4 py-2 sm:px-6 sm:py-3 rounded-full font-medium transition-all duration-200",
selectedMode === 'flowspeech'
selectedMode === 'ai-story'
? "btn-primary"
: "btn-secondary"
)}
>
<AiOutlineStar className="w-4 h-4" />
FlowSpeech
{t('podcastCreator.immersiveStory')}
</button>
</div> */}
</div>
</div>
{/* Main creation area */}
@@ -356,7 +375,7 @@ const PodcastCreator: React.FC<PodcastCreatorProps> = ({
/>
{/* Custom instructions */}
{customInstructions !== undefined && (
{customInstructions !== undefined && selectedMode === 'ai-podcast' && (
<div className="mt-4 pt-4 border-t border-neutral-100">
<textarea
value={customInstructions}
@@ -405,38 +424,42 @@ const PodcastCreator: React.FC<PodcastCreatorProps> = ({
</button>
{/* Language selector */}
<div className="relative w-[120px]">
<select
value={language}
onChange={(e) => setLanguage(e.target.value)}
className="appearance-none w-full bg-white border border-neutral-200 rounded-lg px-3 py-2 pr-8 text-sm font-medium text-neutral-700 focus:outline-none focus:ring-2 focus:ring-purple-500 focus:border-transparent transition-all duration-200 shadow-sm hover:shadow-md hover:border-neutral-300 disabled:opacity-50 disabled:cursor-not-allowed"
disabled={isGenerating}
>
{languageOptions.map(option => (
<option key={option.value} value={option.value}>
{option.label}
</option>
))}
</select>
<AiOutlineDown className="absolute right-2 top-1/2 transform -translate-y-1/2 w-4 h-4 text-neutral-400 pointer-events-none" />
</div>
{selectedMode === 'ai-podcast' && (
<div className="relative w-[120px]">
<select
value={language}
onChange={(e) => setLanguage(e.target.value)}
className="appearance-none w-full bg-white border border-neutral-200 rounded-lg px-3 py-2 pr-8 text-sm font-medium text-neutral-700 focus:outline-none focus:ring-2 focus:ring-purple-500 focus:border-transparent transition-all duration-200 shadow-sm hover:shadow-md hover:border-neutral-300 disabled:opacity-50 disabled:cursor-not-allowed"
disabled={isGenerating}
>
{languageOptions.map(option => (
<option key={option.value} value={option.value}>
{option.label}
</option>
))}
</select>
<AiOutlineDown className="absolute right-2 top-1/2 transform -translate-y-1/2 w-4 h-4 text-neutral-400 pointer-events-none" />
</div>
)}
{/* Duration selector */}
<div className="relative w-[120px]">
<select
value={duration}
onChange={(e) => setDuration(e.target.value as any)}
className="appearance-none w-full bg-white border border-neutral-200 rounded-lg px-3 py-2 pr-8 text-sm font-medium text-neutral-700 focus:outline-none focus:ring-2 focus:ring-purple-500 focus:border-transparent transition-all duration-200 shadow-sm hover:shadow-md hover:border-neutral-300 disabled:opacity-50 disabled:cursor-not-allowed"
disabled={isGenerating}
>
{durationOptions.map(option => (
<option key={option.value} value={option.value}>
{option.label}
</option>
))}
</select>
<AiOutlineDown className="absolute right-2 top-1/2 transform -translate-y-1/2 w-4 h-4 text-neutral-400 pointer-events-none" />
</div>
{selectedMode === 'ai-podcast' && (
<div className="relative w-[120px]">
<select
value={duration}
onChange={(e) => setDuration(e.target.value as any)}
className="appearance-none w-full bg-white border border-neutral-200 rounded-lg px-3 py-2 pr-8 text-sm font-medium text-neutral-700 focus:outline-none focus:ring-2 focus:ring-purple-500 focus:border-transparent transition-all duration-200 shadow-sm hover:shadow-md hover:border-neutral-300 disabled:opacity-50 disabled:cursor-not-allowed"
disabled={isGenerating}
>
{durationOptions.map(option => (
<option key={option.value} value={option.value}>
{option.label}
</option>
))}
</select>
<AiOutlineDown className="absolute right-2 top-1/2 transform -translate-y-1/2 w-4 h-4 text-neutral-400 pointer-events-none" />
</div>
)}
{/* Points display */}
<div className="w-[120px] flex items-center justify-center gap-1.5 px-3 py-2 bg-white border border-neutral-200 rounded-lg shadow-sm">
@@ -575,9 +598,11 @@ const PodcastCreator: React.FC<PodcastCreatorProps> = ({
onConfirm={handleConfirmGenerate}
title={t('podcastCreator.confirmGeneration')}
message={t('podcastCreator.confirmGenerationMessage')}
points={duration === '8-15 minutes' ?
parseInt(process.env.POINTS_PER_PODCAST || '20', 10) * 2 :
parseInt(process.env.POINTS_PER_PODCAST || '20', 10)}
points={selectedMode === 'ai-story'
? 30
: (duration === '8-15 minutes'
? parseInt(process.env.POINTS_PER_PODCAST || '20', 10) * 2
: parseInt(process.env.POINTS_PER_PODCAST || '20', 10))}
lang={lang}
/>
</div>

View File

@@ -29,15 +29,17 @@ export default function PodcastTabs({ parsedScript, overviewContent, lang }: Pod
>
{t('podcastTabs.script')}
</button>
{/* Outline */}
<button
className={`py-4 px-1 text-base font-semibold ${
activeTab === 'overview' ? 'text-gray-900 border-b-2 border-gray-900' : 'text-gray-500 border-b-2 border-transparent hover:text-gray-900'
}`}
onClick={() => setActiveTab('overview')}
>
{t('podcastTabs.outline')}
</button>
{/* Outline: only shown when there is content */}
{overviewContent && (
<button
className={`py-4 px-1 text-base font-semibold ${
activeTab === 'overview' ? 'text-gray-900 border-b-2 border-gray-900' : 'text-gray-500 border-b-2 border-transparent hover:text-gray-900'
}`}
onClick={() => setActiveTab('overview')}
>
{t('podcastTabs.outline')}
</button>
)}
</div>
</div>
</div>

View File

@@ -71,32 +71,43 @@ const Sidebar: React.FC<SidebarProps> = ({
if (!didFetch.current) {
didFetch.current = true; // mark as executed to avoid re-running in development mode
const fetchSession = async () => {
const { session: fetchedSession, user: fetchedUser } = await getSessionData();
setSession(fetchedSession);
console.log('session', fetchedSession); // log only after the session data has been fetched and set
try {
const { session: fetchedSession, user: fetchedUser } = await getSessionData();
setSession(fetchedSession);
console.log('session', fetchedSession); // log only after the session data has been fetched and set
} catch (error) {
console.error('Failed to fetch session:', error);
// If fetching the session fails, don't retry indefinitely
setSession(null);
}
};
fetchSession();
}
}, []); // run only once, when the component mounts
// Check whether the session has expired
if (session?.expiresAt) {
const expirationTime = session.expiresAt.getTime();
const currentTime = new Date().getTime();
// Separate effect dedicated to checking session expiry
useEffect(() => {
if (!session?.expiresAt) return;
if (currentTime > expirationTime) {
console.log(t('sidebar.sessionExpired'));
signOut({
fetchOptions: {
onSuccess: () => {
setSession(null); // session expired; clear the local session state after sign-out succeeds
onCreditsChange(0); // clear the points
router.push(truePath+"/"); // session expired; sign out and redirect to the home page
},
const expirationTime = session.expiresAt.getTime();
const currentTime = new Date().getTime();
if (currentTime > expirationTime) {
console.log(t('sidebar.sessionExpired'));
signOut({
fetchOptions: {
onSuccess: () => {
setSession(null); // session expired; clear the local session state after sign-out succeeds
onCreditsChange(0); // clear the points
router.push(truePath+"/"); // session expired; sign out and redirect to the home page
},
});
}
onError: (error) => {
console.error('Sign out error:', error);
},
},
});
}
}, [session, router, onCreditsChange, t]); // watch session and router (signOut uses router.push) and include onCreditsChange
}, [session?.expiresAt, router, onCreditsChange, t, truePath]); // watch only the necessary dependencies
// todo
const mainNavItems: NavItem[] = [

View File

@@ -4,5 +4,10 @@ import { usernameClient } from "better-auth/client/plugins";
export const { signIn, signUp, signOut, useSession, updateUser, changeEmail, changePassword} =
createAuthClient({
plugins: [usernameClient()],
baseURL: process.env.BETTER_AUTH_URL!,
baseURL: process.env.NEXT_PUBLIC_BASE_URL || process.env.BETTER_AUTH_URL || 'http://localhost:3000',
fetchOptions: {
onError: (ctx) => {
console.error('Auth client error:', ctx.error);
},
},
});

View File

@@ -38,6 +38,41 @@ export async function startPodcastGenerationTask(body: PodcastGenerationRequest,
}
}
/**
* Starts a podcast generation task in immersive story mode (does not pass custom instructions, language, or duration parameters)
*/
export async function startPodcastWithStoryGenerationTask(body: PodcastGenerationRequest, userId: string, lang: string): Promise<ApiResponse<PodcastGenerationResponse>> {
body.lang = lang;
try {
// Create a new request body that excludes the customInstructions, output_language, and usetime parameters
const { output_language, usetime, ...storyBody } = body;
const response = await fetch(`${API_BASE_URL}/generate-podcast-with-story`, {
method: 'POST',
headers: {
'Content-Type': 'application/x-www-form-urlencoded',
'X-Auth-Id': userId,
},
body: new URLSearchParams(Object.entries(storyBody).map(([key, value]) => [key, String(value)])),
});
if (!response.ok) {
const errorData = await response.json().catch(() => ({ detail: `请求失败,状态码: ${response.status}` }));
throw new HttpError(errorData.detail || `请求失败,状态码: ${response.status}`, response.status);
}
const result: PodcastGenerationResponse = await response.json();
// Make sure the id field exists, since it is used widely on the frontend
result.id = result.task_id;
return { success: true, data: result };
} catch (error: any) {
console.error('Error in startPodcastWithStoryGenerationTask:', error);
const statusCode = error instanceof HttpError ? error.statusCode : undefined;
return { success: false, error: error.message || '启动沉浸故事生成任务失败', statusCode };
}
}
/**
* Gets the status of a podcast generation task
*/

View File

@@ -11,6 +11,7 @@ export interface PodcastGenerationRequest {
usetime?: string; // duration, from the user's selection
output_language?: string; // language, from settings
lang?: string; // language indicated by the URL subpath
mode?: 'ai-podcast' | 'ai-story'; // mode flag: AI podcast or immersive story
}
export interface PodcastGenerationResponse {