# podcraft_web_app/backend/app/agents/podcast_manager.py
import requests
import json
import os
import shutil
import subprocess
from datetime import datetime
from decouple import config
from motor.motor_asyncio import AsyncIOMotorClient
from typing import Dict, List
import logging
from fastapi import HTTPException, status
logger = logging.getLogger(__name__)

class Settings:
    MONGODB_URL = config('MONGODB_URL')
    SECRET_KEY = config('SECRET_KEY')
    OPENAI_API_KEY = config('OPENAI_API_KEY')
    # Other settings...

settings = Settings()
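# The settings above are read from the environment via python-decouple. A
# typical .env for local development might look like this (values are
# placeholders, not real credentials):
#   MONGODB_URL=mongodb://localhost:27017
#   SECRET_KEY=change-me
#   OPENAI_API_KEY=sk-...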
client = AsyncIOMotorClient(settings.MONGODB_URL)
db = client.podcraft
podcasts = db.podcasts

class PodcastManager:
    def __init__(self):
        self.tts_url = "https://api.openai.com/v1/audio/speech"
        self.headers = {
            "Authorization": f"Bearer {settings.OPENAI_API_KEY}",
            "Content-Type": "application/json"
        }
        # Create absolute path for temp directory
        self.temp_dir = os.path.abspath("temp_audio")
        os.makedirs(self.temp_dir, exist_ok=True)
        # Define allowed voices
        self.allowed_voices = ["alloy", "echo", "fable", "onyx", "nova", "shimmer", "ash", "sage", "coral"]
    def generate_speech(self, text: str, voice_id: str, filename: str) -> bool:
        """Generate speech using OpenAI's TTS API."""
        try:
            # Debug logging for voice selection
            print("\n=== TTS Generation Details ===")
            print(f"File: {filename}")
            print(f"Voice ID (original): {voice_id}")
            print(f"Voice ID (lowercase): {voice_id.lower()}")
            print(f"Allowed voices: {self.allowed_voices}")

            # Validate and normalize voice_id
            voice = voice_id.lower().strip()
            if voice not in self.allowed_voices:
                print(f"Warning: Invalid voice ID: {voice_id}. Using default voice 'alloy'")
                voice = "alloy"
            print(f"Final voice selection: {voice}")

            # Ensure the output directory exists (filename may not include a directory part)
            output_dir = os.path.dirname(filename)
            if output_dir:
                os.makedirs(output_dir, exist_ok=True)

            payload = {
                "model": "tts-1",
                "input": text,
                "voice": voice
            }
            print(f"TTS API payload: {json.dumps(payload, indent=2)}")
            print(f"Request headers: {json.dumps({k: '***' if k == 'Authorization' else v for k, v in self.headers.items()}, indent=2)}")

            response = requests.post(self.tts_url, json=payload, headers=self.headers)
            if response.status_code != 200:
                print(f"API error response: {response.status_code} - {response.text}")
                return False

            # Write the audio content to the file
            with open(filename, "wb") as f:
                f.write(response.content)

            # Verify the file exists and has content before reporting success
            if not os.path.exists(filename) or os.path.getsize(filename) == 0:
                print(f"Error: Generated file is empty or does not exist: {filename}")
                return False

            print(f"Successfully generated speech file: {filename}")
            print(f"File size: {os.path.getsize(filename)} bytes")
            return True
        except Exception as e:
            print(f"Error generating speech: {str(e)}")
            logger.exception(f"Error generating speech: {str(e)}")
            return False
    def merge_audio_files(self, audio_files: List[str], output_file: str) -> bool:
        """Merge multiple audio files into one using ffmpeg."""
        try:
            if not audio_files:
                print("No audio files to merge")
                return False

            # Verify all input files exist
            for audio_file in audio_files:
                if not os.path.exists(audio_file):
                    print(f"Audio file does not exist: {audio_file}")
                    return False

            # Ensure all paths are absolute and the output directory exists
            output_file = os.path.abspath(output_file)
            output_dir = os.path.dirname(output_file)
            os.makedirs(output_dir, exist_ok=True)

            # Create temporary files in the same directory
            list_file = os.path.join(output_dir, "files.txt")
            silence_file = os.path.join(output_dir, "silence.mp3")
            print(f"Output directory: {output_dir}")
            print(f"List file: {list_file}")
            print(f"Silence file: {silence_file}")

            # Generate a short (0.3 second) silence file used as a gap between segments;
            # -y overwrites any existing file instead of prompting
            silence_result = subprocess.run([
                'ffmpeg', '-y', '-f', 'lavfi', '-i', 'anullsrc=r=44100:cl=mono',
                '-t', '0.3', '-q:a', '9', '-acodec', 'libmp3lame', silence_file
            ], capture_output=True, text=True)
            if silence_result.returncode != 0:
                print(f"Error generating silence file: {silence_result.stderr}")
                return False
            if not os.path.exists(silence_file):
                print("Failed to create silence file")
                return False

            # IMPORTANT: The order here determines the final audio order
            print("\nGenerating files list in exact provided order:")
            try:
                with open(list_file, "w", encoding='utf-8') as f:
                    for i, audio_file in enumerate(audio_files):
                        abs_audio_path = os.path.abspath(audio_file)
                        print(f"{i+1}. Adding audio file: {os.path.basename(abs_audio_path)}")
                        # Use forward slashes for ffmpeg compatibility
                        abs_audio_path = abs_audio_path.replace('\\', '/')
                        silence_path = silence_file.replace('\\', '/')
                        f.write(f"file '{abs_audio_path}'\n")
                        # Add a short silence after each audio segment (except the last one)
                        if i < len(audio_files) - 1:
                            f.write(f"file '{silence_path}'\n")
            except Exception as e:
                print(f"Error writing list file: {str(e)}")
                return False

            if not os.path.exists(list_file):
                print("Failed to create list file")
                return False

            # Print the contents of the list file for debugging
            print("\nContents of files.txt:")
            with open(list_file, 'r', encoding='utf-8') as f:
                print(f.read())
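            # Illustrative files.txt layout for two segments plus a silence gap
            # (the absolute paths below are examples only):
            #   file '/abs/temp_audio/20240101_120000/believer_turn_1_0.mp3'
            #   file '/abs/temp_audio/20240101_120000/silence.mp3'
            #   file '/abs/temp_audio/20240101_120000/skeptic_turn_2_1.mp3'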
            # Merge all files using the concat demuxer with optimized settings
            try:
                # Use the concat demuxer with additional parameters for better playback;
                # -y overwrites the output file if it already exists
                subprocess.run(
                    ['ffmpeg', '-y', '-f', 'concat', '-safe', '0', '-i', list_file,
                     '-c:a', 'libmp3lame', '-q:a', '4', '-ar', '44100',
                     output_file],
                    capture_output=True,
                    text=True,
                    check=True
                )
            except subprocess.CalledProcessError as e:
                logger.error(f"FFmpeg command failed: {e.stderr}")
                return False
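            # Equivalent shell invocation of the concat step above (paths are examples only):
            #   ffmpeg -y -f concat -safe 0 -i files.txt -c:a libmp3lame -q:a 4 -ar 44100 final_podcast.mp3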
            # Verify the output file was created
            if not os.path.exists(output_file):
                print("Failed to create output file")
                return False

            print(f"Successfully created merged audio file: {output_file}")
            return True
        except Exception as e:
            print(f"Error merging audio files: {str(e)}")
            logger.exception(f"Error merging audio files: {str(e)}")
            return False
    async def create_podcast(
        self,
        topic: str,
        research: str,
        conversation_blocks: List[Dict],
        believer_voice_id: str,
        skeptic_voice_id: str,
        user_id: str = None
    ) -> Dict:
        """Create a podcast by converting text to speech and storing the results."""
        podcast_temp_dir = None
        try:
            # Debug logging for voice IDs
            print("\nPodcast Creation - Voice Configuration:")
            print(f"Believer Voice ID: {believer_voice_id}")
            print(f"Skeptic Voice ID: {skeptic_voice_id}")

            # Create a unique directory with absolute path
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            podcast_temp_dir = os.path.abspath(os.path.join(self.temp_dir, timestamp))
            os.makedirs(podcast_temp_dir, exist_ok=True)
            print(f"Created temp directory: {podcast_temp_dir}")
            print(f"Processing conversation blocks: {json.dumps(conversation_blocks, indent=2)}")

            audio_files = []

            # Process the blocks differently based on format:
            # 1. New turn-based format with "type" and "turn" fields
            # 2. Blocks with "input" field but no turn-based structure (old format)
            # 3. Blocks with both "input" field and turn-based structure (mixed format)
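            # Illustrative block shapes for each format (field values are examples only):
            #   new format:   {"type": "believer", "turn": 1, "content": "Opening argument..."}
            #   old format:   {"name": "Believer Agent", "input": "Opening argument..."}
            #   mixed format: {"type": "skeptic", "turn": 2, "input": "Counter-argument..."}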
            # First check: New format blocks with type and turn
            if any("type" in block and "turn" in block and "content" in block for block in conversation_blocks):
                print("\nProcessing new format blocks with type, turn, and content fields")
                # Process conversation blocks in the EXACT order they were provided.
                # This ensures proper alternation between speakers as specified by the caller.
                for idx, block in enumerate(conversation_blocks):
                    if "type" in block and "content" in block and "turn" in block:
                        turn = block.get("turn", 0)
                        agent_type = block.get("type", "")
                        content = block.get("content", "")
                        if not content.strip():  # Skip empty content
                            continue

                        # Use the correct voice based on agent type
                        voice_id = believer_voice_id if agent_type == "believer" else skeptic_voice_id
                        file_prefix = "believer" if agent_type == "believer" else "skeptic"

                        # Create a unique filename with turn number
                        audio_file = os.path.join(podcast_temp_dir, f"{file_prefix}_turn_{turn}_{idx}.mp3")
                        print(f"\nProcessing {agent_type} turn {turn} (index {idx}) with voice {voice_id}")
                        print(f"Content preview: {content[:100]}...")

                        if self.generate_speech(content, voice_id, audio_file):
                            # Add to our audio files list IN THE ORIGINAL ORDER
                            audio_files.append(audio_file)
                            print(f"Generated {agent_type} audio for turn {turn}, added to position {len(audio_files)}")
                        else:
                            raise Exception(f"Failed to generate audio for {agent_type} turn {turn}")

            # Second check: Blocks with input field and possibly turn information
            elif any("input" in block for block in conversation_blocks):
                print("\nProcessing blocks with input field")
                # Check if these blocks also have type and turn information
                has_turn_info = any("turn" in block and "type" in block for block in conversation_blocks)

                if has_turn_info:
                    print("Blocks have both input field and turn-based structure - using mixed format")
                    # Sort by turn if available, ensuring proper sequence
                    sorted_blocks = sorted(conversation_blocks, key=lambda b: b.get("turn", float('inf')))

                    for idx, block in enumerate(sorted_blocks):
                        if "input" in block and block["input"].strip():
                            # Determine voice based on type field or name
                            if "type" in block:
                                is_believer = block["type"] == "believer"
                            else:
                                is_believer = "Believer" in block.get("name", "") or block.get("name", "").lower().startswith("alloy")

                            voice_id = believer_voice_id if is_believer else skeptic_voice_id
                            speaker_type = "believer" if is_believer else "skeptic"
                            turn = block.get("turn", idx + 1)
                            print(f"\nProcessing {speaker_type} block with turn {turn} using voice {voice_id}")

                            audio_file = os.path.join(podcast_temp_dir, f"{speaker_type}_turn_{turn}_{idx}.mp3")
                            if self.generate_speech(block["input"], voice_id, audio_file):
                                audio_files.append(audio_file)
                                print(f"Generated audio for {speaker_type} turn {turn}")
                            else:
                                raise Exception(f"Failed to generate audio for {speaker_type} turn {turn}")
                else:
                    # Old format - process blocks sequentially as they appear
                    print("Processing old format blocks sequentially")
                    for i, block in enumerate(conversation_blocks):
                        if "input" in block and block["input"].strip():
                            # Check for either "Believer" in name or if the name starts with "alloy"
                            is_believer = "Believer" in block.get("name", "") or block.get("name", "").lower().startswith("alloy")
                            voice_id = believer_voice_id if is_believer else skeptic_voice_id
                            speaker_type = "believer" if is_believer else "skeptic"
                            print(f"\nProcessing {speaker_type} block {i+1} with voice {voice_id}")
                            print(f"Block name: {block.get('name', '')}")  # Debug logging

                            audio_file = os.path.join(podcast_temp_dir, f"part_{i+1}.mp3")
                            if self.generate_speech(block["input"], voice_id, audio_file):
                                audio_files.append(audio_file)
                                print(f"Generated audio for part {i+1}")
                            else:
                                raise Exception(f"Failed to generate audio for part {i+1}")
            else:
                raise Exception("Invalid conversation blocks format - no recognizable structure found")

            if not audio_files:
                raise Exception("No audio files were generated from the conversation blocks")

            print(f"\nGenerated {len(audio_files)} audio files in total")
            # Print the final order of audio files for verification
            print("\nFinal audio file order before merging:")
            for i, file in enumerate(audio_files):
                print(f"{i+1}. {os.path.basename(file)}")

            # Merge all audio files
            final_audio = os.path.join(podcast_temp_dir, "final_podcast.mp3")
            print(f"Merging to final audio: {final_audio}")
            if not self.merge_audio_files(audio_files, final_audio):
                raise Exception("Failed to merge audio files")

            # Calculate audio duration using ffprobe
            duration = 0
            try:
                cmd = [
                    'ffprobe',
                    '-v', 'error',
                    '-show_entries', 'format=duration',
                    '-of', 'default=noprint_wrappers=1:nokey=1',
                    final_audio
                ]
                duration_result = subprocess.run(cmd, capture_output=True, text=True)
                if duration_result.returncode == 0:
                    duration = float(duration_result.stdout.strip())
                    print(f"Audio duration: {duration} seconds")
                else:
                    print(f"Failed to get audio duration: {duration_result.stderr}")
            except Exception as e:
                print(f"Error calculating duration: {str(e)}")
                # Don't fail the entire process for duration calculation

            podcast_doc = {
                "topic": topic,
                "research": research,
                "conversation_blocks": conversation_blocks,
                "audio_path": final_audio,
                "created_at": datetime.utcnow(),
                "believer_voice_id": believer_voice_id,
                "skeptic_voice_id": skeptic_voice_id,
                "user_id": user_id,
                "duration": duration  # Add duration to MongoDB document
            }
            result = await podcasts.insert_one(podcast_doc)

            # Clean up individual audio files but keep the final one
            for audio_file in audio_files:
                if os.path.exists(audio_file):
                    os.remove(audio_file)

            return {
                "podcast_id": str(result.inserted_id),
                "audio_path": final_audio,
                "topic": topic,
                "duration": duration  # Return duration in the result
            }
        except Exception as e:
            # Clean up the temp directory in case of error.
            # podcast_temp_dir may still be None if the failure happened before it was created.
            if podcast_temp_dir and os.path.exists(podcast_temp_dir):
                shutil.rmtree(podcast_temp_dir)
            logger.exception(f"Error in podcast creation: {str(e)}")
            return {
                "error": str(e)
            }
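
    # Illustrative success return value from create_podcast (values are examples only):
    #   {"podcast_id": "65f0c0ffee1a2b3c4d5e6f70",
    #    "audio_path": "/abs/temp_audio/20240101_120000/final_podcast.mp3",
    #    "topic": "Example topic",
    #    "duration": 93.4}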
    async def get_podcast(self, podcast_id: str) -> Dict:
        """Retrieve a podcast by ID."""
        try:
            from bson.objectid import ObjectId

            podcast = await podcasts.find_one({"_id": ObjectId(podcast_id)})
            if podcast:
                podcast["_id"] = str(podcast["_id"])
                return podcast
            return {"error": "Podcast not found"}
        except Exception as e:
            return {"error": str(e)}
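

# ----------------------------------------------------------------------------
# Minimal usage sketch (not part of the original module): how PodcastManager
# might be driven from an async context. The topic, research text, voices, and
# conversation blocks below are illustrative placeholders; real callers pass
# data produced by the upstream agents, and valid MongoDB and OpenAI
# credentials must be configured via the environment for this to run.
if __name__ == "__main__":
    import asyncio

    async def _demo():
        manager = PodcastManager()
        result = await manager.create_podcast(
            topic="Example topic",
            research="Short research summary used only for illustration.",
            conversation_blocks=[
                {"type": "believer", "turn": 1, "content": "Opening argument."},
                {"type": "skeptic", "turn": 2, "content": "Counter-argument."},
            ],
            believer_voice_id="alloy",
            skeptic_voice_id="onyx",
        )
        print(result)

    asyncio.run(_demo())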