# podcast_generator.py
import argparse  # Command-line argument parsing
import os
import json
import time
import glob
import sys
import subprocess  # For calling external commands such as ffmpeg
import requests  # For making HTTP requests to the TTS API
import uuid  # For generating unique filenames for temporary audio files
from datetime import datetime
import urllib.parse  # For URL encoding
import re  # For regular expression operations

from openai_cli import OpenAICli  # Moved to top for proper import

# Global configuration
output_dir = "output"
file_list_path = os.path.join(output_dir, "file_list.txt")


def read_file_content(filepath):
    """Reads content from a given file path."""
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            return f.read()
    except FileNotFoundError:
        print(f"Error: File not found at {filepath}")
        sys.exit(1)


def select_json_config(config_dir='config'):
    """
    Reads JSON files from the specified directory and allows the user to select one.
    Returns the content of the selected JSON file.
    """
    json_files = glob.glob(os.path.join(config_dir, '*.json'))
    if not json_files:
        print(f"Error: No JSON files found in {config_dir}")
        sys.exit(1)

    print(f"Found JSON configuration files in '{config_dir}':")
    for i, file_path in enumerate(json_files):
        print(f"{i + 1}. {os.path.basename(file_path)}")

    while True:
        try:
            choice = int(input("Enter the number of the configuration file to use: "))
            if 1 <= choice <= len(json_files):
                selected_file = json_files[choice - 1]
                print(f"Selected: {os.path.basename(selected_file)}")
                with open(selected_file, 'r', encoding='utf-8') as f:
                    return json.load(f)
            else:
                print("Invalid choice. Please enter a number within the range.")
        except ValueError:
            print("Invalid input. Please enter a number.")


def generate_speaker_id_text(pod_users, voices_list):
    """
    Generates a text string mapping speaker IDs to their names/aliases based on podUsers and voices.
    Optimized by converting voices_list to a dictionary for faster lookups.
    """
    voice_map = {voice.get("code"): voice for voice in voices_list if voice.get("code")}
    speaker_info = []
    for speaker_id, pod_user in enumerate(pod_users):
        pod_user_code = pod_user.get("code")
        role = pod_user.get("role", "")  # Default to an empty string if no role is provided
        found_name = None
        voice = voice_map.get(pod_user_code)
        if voice:
            found_name = voice.get("usedname") or voice.get("alias") or voice.get("name")
        if found_name:
            if role:
                # Chinese prompt fragment: "speaker_id={id} is named {name} and is a {role}"
                speaker_info.append(f"speaker_id={speaker_id}的名叫{found_name},是一个{role}")
            else:
                speaker_info.append(f"speaker_id={speaker_id}的名叫{found_name}")
        else:
            raise ValueError(
                f"No name or alias found for voice code '{pod_user_code}' (speaker_id={speaker_id}). "
                f"Please check the voices configuration in config/edge-tts.json."
            )
    return "。".join(speaker_info) + "。"


def merge_audio_files():
    output_audio_filename = f"podcast_{int(time.time())}.wav"

    # Use ffmpeg to concatenate audio files; first check that ffmpeg is available
    try:
        subprocess.run(["ffmpeg", "-version"], check=True, capture_output=True)
    except FileNotFoundError:
        print("Error: FFmpeg is not installed or not in your PATH. Please install FFmpeg to merge audio files.")
        print("You can download FFmpeg from: https://ffmpeg.org/download.html")
        sys.exit(1)

    print(f"\nMerging audio files into {output_audio_filename}...")
    try:
        command = [
            "ffmpeg",
            "-f", "concat",
            "-safe", "0",
            "-i", os.path.basename(file_list_path),
            "-acodec", "pcm_s16le",
            "-ar", "44100",
            "-ac", "2",
            output_audio_filename,
        ]
        # Execute ffmpeg from output_dir so the relative paths in file_list.txt resolve correctly
        process = subprocess.run(command, check=True, cwd=output_dir, capture_output=True, text=True)
        print("Audio files merged successfully!")
        print("FFmpeg stdout:\n", process.stdout)
        print("FFmpeg stderr:\n", process.stderr)
    except subprocess.CalledProcessError as e:
        print(f"Error merging audio files with FFmpeg: {e}")
        print("FFmpeg stdout:\n", e.stdout)
        print("FFmpeg stderr:\n", e.stderr)
        sys.exit(1)
    finally:
        # Clean up temporary audio files and the file list
        for item in os.listdir(output_dir):
            if item.startswith("temp_audio"):
                try:
                    os.remove(os.path.join(output_dir, item))
                except OSError as e:
                    print(f"Error removing temporary audio file {item}: {e}")
        try:
            os.remove(file_list_path)
        except OSError as e:
            print(f"Error removing file list {file_list_path}: {e}")
        print("Cleaned up temporary files.")


def main():
    # Parse command-line arguments
    parser = argparse.ArgumentParser(description="Generate podcast script and audio using OpenAI and local TTS.")
    parser.add_argument("--api-key", help="OpenAI API key.")
    parser.add_argument("--base-url", default=None,
                        help="OpenAI API base URL (default: https://api.openai.com/v1).")
    parser.add_argument("--model", default=None,
                        help="OpenAI model to use (default: gpt-3.5-turbo).")
    parser.add_argument("--threads", type=int, default=1,
                        help="Number of threads to use for audio generation (default: 1).")
    args = parser.parse_args()

    print("Podcast Generation Script")

    # Step 1: Select JSON configuration
    config_data = select_json_config()
    print("\nLoaded Configuration:")
    # print(json.dumps(config_data, indent=4))

    # Determine the final API key, base URL, and model by priority:
    # command-line args > config file > environment variables.
    # --base-url and --model default to None above so the config file and
    # environment variables can take effect when the flags are omitted.
    api_key = args.api_key or config_data.get("api_key") or os.getenv("OPENAI_API_KEY")
    base_url = (args.base_url or config_data.get("base_url")
                or os.getenv("OPENAI_BASE_URL") or "https://api.openai.com/v1")
    model = args.model or config_data.get("model")  # May still be None if not provided anywhere

    # Fallback for model if not specified
    if not model:
        model = "gpt-3.5-turbo"
        print(f"Using default model: {model} as it was not specified via command-line, config, or environment variables.")

    if not api_key:
        print("Error: OpenAI API key is not set. Please provide it via --api-key, in your config file, "
              "or as an environment variable (OPENAI_API_KEY).")
        sys.exit(1)

    # Step 2: Read prompt files
    input_prompt = read_file_content('input.txt')
    overview_prompt = read_file_content('prompt/prompt-overview.txt')
    original_podscript_prompt = read_file_content('prompt/prompt-podscript.txt')

    # Extract custom content from input_prompt
    custom_content = ""
    custom_begin_tag = '```custom-begin'
    custom_end_tag = '```custom-end'
    start_index = input_prompt.find(custom_begin_tag)
    if start_index != -1:
        end_index = input_prompt.find(custom_end_tag, start_index + len(custom_begin_tag))
        if end_index != -1:
            custom_content = input_prompt[start_index + len(custom_begin_tag):end_index].strip()
            # Remove everything in input_prompt up to and including the ```custom-end tag
            input_prompt = input_prompt[end_index + len(custom_end_tag):].strip()

    pod_users = config_data.get("podUsers", [])
    voices = config_data.get("voices", [])
    turn_pattern = config_data.get("turnPattern", "random")

    # Replace placeholders in original_podscript_prompt
    original_podscript_prompt = original_podscript_prompt.replace("{{numSpeakers}}", str(len(pod_users)))
    original_podscript_prompt = original_podscript_prompt.replace("{{turnPattern}}", turn_pattern)

    speaker_id_info = generate_speaker_id_text(pod_users, voices)
    # Combine the speaker info, the podscript prompt, and the custom content into the final prompt
    podscript_prompt = speaker_id_info + "\n\n" + original_podscript_prompt + "\n\n" + custom_content

    print(f"\nInput Prompt (input.txt):\n{input_prompt[:100]}...")  # Display the first 100 chars
    print(f"\nOverview Prompt (prompt-overview.txt):\n{overview_prompt[:100]}...")
    print(f"\nPodscript Prompt (prompt-podscript.txt):\n{podscript_prompt[:1000]}...")

    # Step 4 & 5: Call openai_cli to generate overview content
    print("\nGenerating overview with OpenAI CLI...")
    try:
        openai_client_overview = OpenAICli(api_key=api_key, base_url=base_url, model=model, system_message=overview_prompt)
        overview_response_generator = openai_client_overview.chat_completion(messages=[{"role": "user", "content": input_prompt}])
        overview_content = "".join([chunk.choices[0].delta.content for chunk in overview_response_generator
                                    if chunk.choices and chunk.choices[0].delta.content])
        print("Generated Overview:")
        print(overview_content[:100])
    except Exception as e:
        print(f"Error generating overview: {e}")
        sys.exit(1)

    # Step 6: Call openai_cli to generate podcast script JSON
    print("\nGenerating podcast script with OpenAI CLI...")
    try:
        openai_client_podscript = OpenAICli(api_key=api_key, base_url=base_url, model=model, system_message=podscript_prompt)
        podscript_response_generator = openai_client_podscript.chat_completion(messages=[{"role": "user", "content": overview_content}])
        podscript_json_str = "".join([chunk.choices[0].delta.content for chunk in podscript_response_generator
                                      if chunk.choices and chunk.choices[0].delta.content])

        # try:
        #     output_script_filename = os.path.join(output_dir, f"podcast_script_{int(time.time())}.json")
        #     with open(output_script_filename, 'w', encoding='utf-8') as f:
        #         json.dump(podscript_json_str, f, ensure_ascii=False, indent=4)
        #     print(f"Podcast script saved to {output_script_filename}")
        # except Exception as e:
        #     print(f"Error saving podcast script to file: {e}")
        #     sys.exit(1)

        # Attempt to parse the JSON string. OpenAI sometimes returns extra text.
        podcast_script = None
        decoder = json.JSONDecoder()
        idx = 0
        valid_json_str = ""
        while idx < len(podscript_json_str):
            try:
                obj, end = decoder.raw_decode(podscript_json_str[idx:])
                # Check if this object is the expected podcast script
                if isinstance(obj, dict) and "podcast_transcripts" in obj:
                    podcast_script = obj
                    valid_json_str = podscript_json_str[idx:idx + end]  # Capture the exact valid JSON string
                    break  # Found the desired JSON, stop searching
                idx += end  # Move to the end of the current JSON object
            except json.JSONDecodeError:
                # If decoding fails, advance the index by one and continue
                idx += 1
                # Optionally, skip ahead to the next potential JSON start if it's far away
                next_brace = podscript_json_str.find('{', idx)
                if next_brace != -1:
                    idx = next_brace
                else:
                    break  # No more braces, no more JSON to find

        if podcast_script is None:
            print("Error: Could not find a valid podcast script JSON object with 'podcast_transcripts' key in response.")
            print(f"Raw response: {podscript_json_str}")
            sys.exit(1)

        print(f"\nGenerated Podcast Script Length: {len(podcast_script.get('podcast_transcripts') or [])}")
        print(valid_json_str[:100] + "...")  # Print the beginning of the *actual* parsed JSON

        if not podcast_script.get("podcast_transcripts"):
            print("Warning: 'podcast_transcripts' array is empty or not found in the generated script. Nothing to convert to audio.")
            sys.exit(0)  # Exit gracefully if there are no transcripts to process
    except Exception as e:
        print(f"Error generating podcast script: {e}")
        sys.exit(1)

    # Step 7: Parse podcast script and generate audio
    os.makedirs(output_dir, exist_ok=True)  # Create the output directory if it doesn't exist

    def generate_audio_for_item(item, index):
        """Generate audio for a single podcast transcript item."""
        speaker_id = item.get("speaker_id")
        dialog = item.get("dialog")

        # Get the voice code for this speaker_id (used as an index into config_data["podUsers"])
        voice_code = None
        if config_data and "podUsers" in config_data and 0 <= speaker_id < len(config_data["podUsers"]):
            pod_user_entry = config_data["podUsers"][speaker_id]
            voice_code = pod_user_entry.get("code")

        if not voice_code:
            print(f"Warning: No voice code found for speaker_id {speaker_id}. Skipping this dialog.")
            return None

        # Strip unsupported punctuation, keeping word characters, whitespace, hyphens, CJK
        # characters, and basic punctuation (commas, periods, question/exclamation marks)
        dialog = re.sub(r'[^\w\s\-,,.。??!!\u4e00-\u9fa5]', '', dialog)
        print(f"dialog: {dialog}")

        # Replace placeholders in apiUrl; URL-encode the dialog before substituting {{text}}
        encoded_dialog = urllib.parse.quote(dialog)
        api_url = config_data.get("apiUrl", "").replace("{{text}}", encoded_dialog).replace("{{voiceCode}}", voice_code)
        if not api_url:
            print(f"Warning: apiUrl not found in config. Skipping dialog for speaker_id {speaker_id}.")
            return None

        try:
            print(f"Calling TTS API for speaker {speaker_id} with voice {voice_code}...")
            response = requests.get(api_url, stream=True)
            response.raise_for_status()  # Raise an exception for bad status codes

            # Save the audio stream to a temporary file
            temp_audio_file = os.path.join(output_dir, f"temp_audio_{uuid.uuid4()}.mp3")
            with open(temp_audio_file, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
            print(f"Generated {os.path.basename(temp_audio_file)}")
            return temp_audio_file
        except requests.exceptions.RequestException as e:
            print(f"Error calling TTS API for speaker {speaker_id} ({voice_code}): {e}")
            return None

    print("\nGenerating audio files...")
    transcripts = podcast_script.get("podcast_transcripts", [])

    # Use ThreadPoolExecutor for multi-threaded audio generation
    from concurrent.futures import ThreadPoolExecutor, as_completed

    # Collect results in a dictionary keyed by transcript index
    audio_files_dict = {}
    with ThreadPoolExecutor(max_workers=args.threads) as executor:
        # Submit all tasks with their indices
        future_to_index = {
            executor.submit(generate_audio_for_item, item, i): i
            for i, item in enumerate(transcripts)
        }
        # Collect results as they complete
        for future in as_completed(future_to_index):
            index = future_to_index[future]
            try:
                result = future.result()
                if result:
                    audio_files_dict[index] = result
            except Exception as e:
                print(f"Error generating audio for item {index}: {e}")

    # Convert the dictionary to a list in transcript order
    audio_files = [audio_files_dict[i] for i in sorted(audio_files_dict.keys())]
    print(f"\nFinished generating individual audio files. Total files: {len(audio_files)}")

    if not audio_files:
        print("No audio files were generated to merge.")
        return

    # Write the FFmpeg concat file list that merge_audio_files() consumes
    print(f"Creating file list for ffmpeg at: {file_list_path}")
    with open(file_list_path, 'w', encoding='utf-8') as f:
        for audio_file in audio_files:
            # The FFmpeg concat demuxer needs paths relative to file_list.txt (or absolute);
            # basenames work here because file_list.txt lives in output_dir alongside the clips.
            f.write(f"file '{os.path.basename(audio_file)}'\n")

    print("Content of file_list.txt:")
    with open(file_list_path, 'r', encoding='utf-8') as f:
        print(f.read())


if __name__ == "__main__":
    start_time = time.time()  # Record the start time
    main()
    merge_audio_files()
    end_time = time.time()  # Record the end time
    execution_time = end_time - start_time  # Calculate the total execution time
    print(f"\nTotal execution time: {execution_time:.2f} seconds")
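
# Example usage (an illustrative sketch: the field values below are hypothetical, but the
# keys are the ones this script reads -- apiUrl, turnPattern, podUsers, voices, and the
# optional api_key/base_url/model overrides):
#
#   config/edge-tts.json
#   {
#       "apiUrl": "http://localhost:5050/tts?text={{text}}&voice={{voiceCode}}",
#       "turnPattern": "random",
#       "podUsers": [{"code": "zh-CN-XiaoxiaoNeural", "role": "host"}],
#       "voices": [{"code": "zh-CN-XiaoxiaoNeural", "name": "Xiaoxiao", "alias": "小小"}],
#       "model": "gpt-3.5-turbo"
#   }
#
#   export OPENAI_API_KEY=sk-...
#   python podcast_generator.py --threads 4
#
# The script prompts for a config from config/, generates the overview and podcast script
# via the OpenAI API, synthesizes one MP3 clip per dialog through the TTS apiUrl, and
# merges the clips into output/podcast_<timestamp>.wav with FFmpeg.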