import requests
import json
import os
import shutil
import subprocess
from datetime import datetime
from decouple import config
from motor.motor_asyncio import AsyncIOMotorClient
from typing import Dict, List
import logging
from fastapi import HTTPException, status

logger = logging.getLogger(__name__)


class Settings:
    MONGODB_URL = config('MONGODB_URL')
    SECRET_KEY = config('SECRET_KEY')
    OPENAI_API_KEY = config('OPENAI_API_KEY')
    # Other settings...


settings = Settings()

client = AsyncIOMotorClient(settings.MONGODB_URL)
db = client.podcraft
podcasts = db.podcasts


class PodcastManager:
    def __init__(self):
        self.tts_url = "https://api.openai.com/v1/audio/speech"
        self.headers = {
            "Authorization": f"Bearer {settings.OPENAI_API_KEY}",
            "Content-Type": "application/json"
        }
        # Create absolute path for temp directory
        self.temp_dir = os.path.abspath("temp_audio")
        os.makedirs(self.temp_dir, exist_ok=True)
        # Define allowed voices
        self.allowed_voices = ["alloy", "echo", "fable", "onyx", "nova", "shimmer", "ash", "sage", "coral"]

    def generate_speech(self, text: str, voice_id: str, filename: str) -> bool:
        """Generate speech using OpenAI's TTS API."""
        try:
            # Debug logging for voice selection
            print(f"\n=== TTS Generation Details ===")
            print(f"File: {filename}")
            print(f"Voice ID (original): {voice_id}")
            print(f"Voice ID (lowercase): {voice_id.lower()}")
            print(f"Allowed voices: {self.allowed_voices}")

            # Validate and normalize voice_id
            voice = voice_id.lower().strip()
            if voice not in self.allowed_voices:
                print(f"Warning: Invalid voice ID: {voice_id}. Using default voice 'alloy'")
                voice = "alloy"

            print(f"Final voice selection: {voice}")

            # Ensure the output directory exists
            output_dir = os.path.dirname(filename)
            os.makedirs(output_dir, exist_ok=True)

            payload = {
                "model": "tts-1",
                "input": text,
                "voice": voice
            }

            print(f"TTS API payload: {json.dumps(payload, indent=2)}")
            print(f"Request headers: {json.dumps({k: '***' if k == 'Authorization' else v for k, v in self.headers.items()}, indent=2)}")

            response = requests.post(self.tts_url, json=payload, headers=self.headers)

            if response.status_code != 200:
                print(f"API error response: {response.status_code} - {response.text}")
                return False

            # Write the audio content to the file
            with open(filename, "wb") as f:
                f.write(response.content)

            print(f"Successfully generated speech file: {filename}")
            print(f"File size: {os.path.getsize(filename)} bytes")

            # Verify the file exists and has content
            if not os.path.exists(filename) or os.path.getsize(filename) == 0:
                print(f"Error: Generated file is empty or does not exist: {filename}")
                return False

            return True
        except Exception as e:
            print(f"Error generating speech: {str(e)}")
            logger.exception(f"Error generating speech: {str(e)}")
            return False

    def merge_audio_files(self, audio_files: List[str], output_file: str) -> bool:
        """Merge multiple audio files into one using ffmpeg."""
        try:
            # Ensure output directory exists
            output_dir = os.path.dirname(os.path.abspath(output_file))
            os.makedirs(output_dir, exist_ok=True)

            if not audio_files:
                print("No audio files to merge")
                return False

            # Verify all input files exist
            for audio_file in audio_files:
                if not os.path.exists(audio_file):
                    print(f"Audio file does not exist: {audio_file}")
                    return False

            # Ensure all paths are absolute
            output_file = os.path.abspath(output_file)
            output_dir = os.path.dirname(output_file)
            os.makedirs(output_dir, exist_ok=True)

            # Create temporary files in the same directory
            list_file = os.path.join(output_dir, "files.txt")
            silence_file = os.path.join(output_dir, "silence.mp3")
print(f"Output directory: {output_dir}") print(f"List file: {list_file}") print(f"Silence file: {silence_file}") # Generate shorter silence file (0.3 seconds instead of 1 second) silence_result = subprocess.run([ 'ffmpeg', '-f', 'lavfi', '-i', 'anullsrc=r=44100:cl=mono', '-t', '0.3', '-q:a', '9', '-acodec', 'libmp3lame', silence_file ], capture_output=True, text=True) if silence_result.returncode != 0: print(f"Error generating silence file: {silence_result.stderr}") return False if not os.path.exists(silence_file): print("Failed to create silence file") return False # IMPORTANT: The order here determines the final audio order print("\nGenerating files list in exact provided order:") try: with open(list_file, "w", encoding='utf-8') as f: for i, audio_file in enumerate(audio_files): abs_audio_path = os.path.abspath(audio_file) print(f"{i+1}. Adding audio file: {os.path.basename(abs_audio_path)}") # Use forward slashes for ffmpeg compatibility abs_audio_path = abs_audio_path.replace('\\', '/') silence_path = silence_file.replace('\\', '/') f.write(f"file '{abs_audio_path}'\n") # Add a shorter silence after each audio segment (except the last one) if i < len(audio_files) - 1: f.write(f"file '{silence_path}'\n") except Exception as e: print(f"Error writing list file: {str(e)}") return False if not os.path.exists(list_file): print("Failed to create list file") return False # Print the contents of the list file for debugging print("\nContents of files.txt:") with open(list_file, 'r', encoding='utf-8') as f: print(f.read()) # Merge all files using the concat demuxer with optimized settings try: # Use concat demuxer with additional parameters for better playback result = subprocess.run( ['ffmpeg', '-f', 'concat', '-safe', '0', '-i', list_file, '-c:a', 'libmp3lame', '-q:a', '4', '-ar', '44100', output_file], capture_output=True, text=True, check=True ) except subprocess.CalledProcessError as e: logger.error(f"FFmpeg command failed: {e.stderr}") return False # Verify the output file was created if not os.path.exists(output_file): print("Failed to create output file") return False print(f"Successfully created merged audio file: {output_file}") return True except Exception as e: print(f"Error merging audio files: {str(e)}") return False async def create_podcast( self, topic: str, research: str, conversation_blocks: List[Dict], believer_voice_id: str, skeptic_voice_id: str, user_id: str = None ) -> Dict: """Create a podcast by converting text to speech and storing the results.""" podcast_temp_dir = None try: # Debug logging for voice IDs print(f"\nPodcast Creation - Voice Configuration:") print(f"Believer Voice ID: {believer_voice_id}") print(f"Skeptic Voice ID: {skeptic_voice_id}") # Create a unique directory with absolute path timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") podcast_temp_dir = os.path.abspath(os.path.join(self.temp_dir, timestamp)) os.makedirs(podcast_temp_dir, exist_ok=True) print(f"Created temp directory: {podcast_temp_dir}") print(f"Processing conversation blocks: {json.dumps(conversation_blocks, indent=2)}") audio_files = [] # Process the blocks differently based on format: # 1. New turn-based format with "type" and "turn" fields # 2. Blocks with "input" field but no turn-based structure (old format) # 3. 
            # First check: New format blocks with type and turn
            if any("type" in block and "turn" in block and "content" in block for block in conversation_blocks):
                print("\nProcessing new format blocks with type, turn, and content fields")

                # Process conversation blocks in the EXACT order they were provided
                # This ensures proper alternation between speakers as specified by the caller
                for idx, block in enumerate(conversation_blocks):
                    if "type" in block and "content" in block and "turn" in block:
                        turn = block.get("turn", 0)
                        agent_type = block.get("type", "")
                        content = block.get("content", "")

                        if not content.strip():
                            # Skip empty content
                            continue

                        # Use the correct voice based on agent type
                        voice_id = believer_voice_id if agent_type == "believer" else skeptic_voice_id
                        file_prefix = "believer" if agent_type == "believer" else "skeptic"

                        # Create a unique filename with turn number
                        audio_file = os.path.join(podcast_temp_dir, f"{file_prefix}_turn_{turn}_{idx}.mp3")

                        print(f"\nProcessing {agent_type} turn {turn} (index {idx}) with voice {voice_id}")
                        print(f"Content preview: {content[:100]}...")

                        if self.generate_speech(content, voice_id, audio_file):
                            # Add to our audio files list IN THE ORIGINAL ORDER
                            audio_files.append(audio_file)
                            print(f"Generated {agent_type} audio for turn {turn}, added to position {len(audio_files)}")
                        else:
                            raise Exception(f"Failed to generate audio for {agent_type} turn {turn}")

            # Second check: Blocks with input field and possibly turn information
            elif any("input" in block for block in conversation_blocks):
                print("\nProcessing blocks with input field")

                # Check if these blocks also have type and turn information
                has_turn_info = any("turn" in block and "type" in block for block in conversation_blocks)

                if has_turn_info:
                    print("Blocks have both input field and turn-based structure - using mixed format")
                    # Sort by turn if available, ensuring proper sequence
                    sorted_blocks = sorted(conversation_blocks, key=lambda b: b.get("turn", float('inf')))

                    for idx, block in enumerate(sorted_blocks):
                        if "input" in block and block["input"].strip():
                            # Determine voice based on type field or name
                            if "type" in block:
                                is_believer = block["type"] == "believer"
                            else:
                                is_believer = "Believer" in block.get("name", "") or block.get("name", "").lower().startswith("alloy")

                            voice_id = believer_voice_id if is_believer else skeptic_voice_id
                            speaker_type = "believer" if is_believer else "skeptic"
                            turn = block.get("turn", idx + 1)

                            print(f"\nProcessing {speaker_type} block with turn {turn} using voice {voice_id}")

                            audio_file = os.path.join(podcast_temp_dir, f"{speaker_type}_turn_{turn}_{idx}.mp3")

                            if self.generate_speech(block["input"], voice_id, audio_file):
                                audio_files.append(audio_file)
                                print(f"Generated audio for {speaker_type} turn {turn}")
                            else:
                                raise Exception(f"Failed to generate audio for {speaker_type} turn {turn}")
                else:
                    # Old format - process blocks sequentially as they appear
                    print("Processing old format blocks sequentially")
                    for i, block in enumerate(conversation_blocks):
                        if "input" in block and block["input"].strip():
                            # Check for either "Believer" in name or if the name starts with "alloy"
                            is_believer = "Believer" in block.get("name", "") or block.get("name", "").lower().startswith("alloy")
                            voice_id = believer_voice_id if is_believer else skeptic_voice_id
                            speaker_type = "believer" if is_believer else "skeptic"

                            print(f"\nProcessing {speaker_type} block {i+1} with voice {voice_id}")
                            print(f"Block name: {block.get('name', '')}")  # Debug logging
                            audio_file = os.path.join(podcast_temp_dir, f"part_{i+1}.mp3")

                            if self.generate_speech(block["input"], voice_id, audio_file):
                                audio_files.append(audio_file)
                                print(f"Generated audio for part {i+1}")
                            else:
                                raise Exception(f"Failed to generate audio for part {i+1}")
            else:
                raise Exception("Invalid conversation blocks format - no recognizable structure found")

            if not audio_files:
                raise Exception("No audio files were generated from the conversation blocks")

            print(f"\nGenerated {len(audio_files)} audio files in total")

            # Print the final order of audio files for verification
            print("\nFinal audio file order before merging:")
            for i, file in enumerate(audio_files):
                print(f"{i+1}. {os.path.basename(file)}")

            # Merge all audio files
            final_audio = os.path.join(podcast_temp_dir, "final_podcast.mp3")
            print(f"Merging to final audio: {final_audio}")

            if not self.merge_audio_files(audio_files, final_audio):
                raise Exception("Failed to merge audio files")

            # Calculate audio duration using ffprobe
            duration = 0
            try:
                cmd = [
                    'ffprobe', '-v', 'error', '-show_entries', 'format=duration',
                    '-of', 'default=noprint_wrappers=1:nokey=1', final_audio
                ]
                duration_result = subprocess.run(cmd, capture_output=True, text=True)
                if duration_result.returncode == 0:
                    duration = float(duration_result.stdout.strip())
                    print(f"Audio duration: {duration} seconds")
                else:
                    print(f"Failed to get audio duration: {duration_result.stderr}")
            except Exception as e:
                print(f"Error calculating duration: {str(e)}")
                # Don't fail the entire process for duration calculation

            podcast_doc = {
                "topic": topic,
                "research": research,
                "conversation_blocks": conversation_blocks,
                "audio_path": final_audio,
                "created_at": datetime.utcnow(),
                "believer_voice_id": believer_voice_id,
                "skeptic_voice_id": skeptic_voice_id,
                "user_id": user_id,
                "duration": duration  # Add duration to MongoDB document
            }

            result = await podcasts.insert_one(podcast_doc)

            # Clean up individual audio files but keep the final one
            for audio_file in audio_files:
                if os.path.exists(audio_file):
                    os.remove(audio_file)

            return {
                "podcast_id": str(result.inserted_id),
                "audio_path": final_audio,
                "topic": topic,
                "duration": duration  # Return duration in the result
            }
        except Exception as e:
            # Clean up the temp directory in case of error.
            # Guard against podcast_temp_dir still being None if the failure
            # happened before the directory was created.
            if podcast_temp_dir and os.path.exists(podcast_temp_dir):
                shutil.rmtree(podcast_temp_dir)
            logger.exception(f"Error in podcast creation: {str(e)}")
            return {
                "error": str(e)
            }

    async def get_podcast(self, podcast_id: str) -> Dict:
        """Retrieve a podcast by ID."""
        try:
            from bson.objectid import ObjectId
            podcast = await podcasts.find_one({"_id": ObjectId(podcast_id)})
            if podcast:
                podcast["_id"] = str(podcast["_id"])
                return podcast
            return {"error": "Podcast not found"}
        except Exception as e:
            return {"error": str(e)}
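

# The block below is a minimal usage sketch, not part of the original module. It assumes
# ffmpeg/ffprobe are on PATH, that MONGODB_URL and OPENAI_API_KEY resolve via decouple,
# and that the topic, research text, conversation blocks, and voice IDs are hypothetical
# placeholders. Depending on the motor version, the module-level AsyncIOMotorClient may
# need to be created inside the running event loop instead of at import time.
if __name__ == "__main__":
    import asyncio

    async def _demo():
        manager = PodcastManager()
        # New turn-based format: each block carries "type", "turn", and "content",
        # so create_podcast takes its first (turn-ordered) processing branch.
        blocks = [
            {"type": "believer", "turn": 1, "content": "AI tutors could personalize learning."},
            {"type": "skeptic", "turn": 2, "content": "Only if the hallucination problem is solved."},
        ]
        result = await manager.create_podcast(
            topic="AI in education",
            research="Placeholder research summary.",
            conversation_blocks=blocks,
            believer_voice_id="alloy",
            skeptic_voice_id="onyx",
        )
        print(result)

    asyncio.run(_demo())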