# File size: 18,685 Bytes
# fd52f31
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
import requests
import json
import os
import shutil
import subprocess
from datetime import datetime
from decouple import config
from motor.motor_asyncio import AsyncIOMotorClient
from typing import Dict, List
import logging
from fastapi import HTTPException, status

# Module-level logger named after this module; used for error reporting below.
logger = logging.getLogger(__name__)

class Settings:
    """Application settings read through ``decouple.config``.

    The lookups are class-level assignments, so they run once when this
    class body is executed (i.e. at module import). No default is passed to
    ``config``, so a missing value presumably raises at import time --
    confirm against the python-decouple documentation.
    """
    # Connection string passed to AsyncIOMotorClient further down the module.
    MONGODB_URL = config('MONGODB_URL')
    # Read from configuration; not referenced by the code visible in this file.
    SECRET_KEY = config('SECRET_KEY')
    # Used as the Bearer token in PodcastManager's TTS request headers.
    OPENAI_API_KEY = config('OPENAI_API_KEY')
    # Other settings...

# Single module-level settings instance read by the rest of this file.
settings = Settings()

# MongoDB client object created at import time from the configured URL.
# `db` is the "podcraft" database and `podcasts` the collection that
# PodcastManager inserts into and queries.
client = AsyncIOMotorClient(settings.MONGODB_URL)
db = client.podcraft
podcasts = db.podcasts

class PodcastManager:
    def __init__(self):
        """Prepare the TTS endpoint, request headers, scratch directory and voice list."""
        # Endpoint and credentials for OpenAI's text-to-speech API
        self.tts_url = "https://api.openai.com/v1/audio/speech"
        bearer_token = f"Bearer {settings.OPENAI_API_KEY}"
        self.headers = dict([
            ("Authorization", bearer_token),
            ("Content-Type", "application/json"),
        ])

        # Scratch directory, resolved against the current working directory
        scratch_root = os.path.abspath("temp_audio")
        self.temp_dir = scratch_root
        os.makedirs(scratch_root, exist_ok=True)

        # Voice names that generate_speech accepts without falling back
        self.allowed_voices = [
            "alloy",
            "echo",
            "fable",
            "onyx",
            "nova",
            "shimmer",
            "ash",
            "sage",
            "coral",
        ]

    def generate_speech(self, text: str, voice_id: str, filename: str) -> bool:
        """Synthesize ``text`` with OpenAI's TTS API and write the audio to ``filename``.

        Args:
            text: Text to convert to speech. Blank text is rejected without
                calling the API.
            voice_id: Requested voice name. It is lower-cased and stripped; a
                missing value or one not in ``self.allowed_voices`` falls
                back to ``"alloy"``.
            filename: Destination path for the returned audio bytes. The
                parent directory is created when the path contains one.

        Returns:
            True when the API answered 200 and a non-empty file was written.
            False for blank text, a non-200 response, an empty result, or any
            exception (this method never raises).
        """
        try:
            # Treat a missing voice like an unknown one instead of crashing on .lower()
            requested_voice = voice_id or ""

            # Debug logging for voice selection
            print("\n=== TTS Generation Details ===")
            print(f"File: {filename}")
            print(f"Voice ID (original): {voice_id}")
            print(f"Voice ID (lowercase): {requested_voice.lower()}")
            print(f"Allowed voices: {self.allowed_voices}")

            if not text or not text.strip():
                print("Error: No text provided for speech generation")
                return False

            # Validate and normalize voice_id
            voice = requested_voice.lower().strip()
            if voice not in self.allowed_voices:
                print(f"Warning: Invalid voice ID: {voice_id}. Using default voice 'alloy'")
                voice = "alloy"

            print(f"Final voice selection: {voice}")

            # Ensure the output directory exists. A bare filename has no
            # directory component and os.makedirs("") would raise.
            output_dir = os.path.dirname(filename)
            if output_dir:
                os.makedirs(output_dir, exist_ok=True)

            payload = {
                "model": "tts-1",
                "input": text,
                "voice": voice
            }

            print(f"TTS API payload: {json.dumps(payload, indent=2)}")
            # Never print the bearer token
            masked_headers = {
                k: '***' if k == 'Authorization' else v
                for k, v in self.headers.items()
            }
            print(f"Request headers: {json.dumps(masked_headers, indent=2)}")

            # (connect, read) timeouts in seconds: without them a stalled
            # connection would block the caller indefinitely.
            response = requests.post(
                self.tts_url,
                json=payload,
                headers=self.headers,
                timeout=(10, 120),
            )
            if response.status_code != 200:
                print(f"API error response: {response.status_code} - {response.text}")
                return False

            # Write the audio content to the file
            with open(filename, "wb") as f:
                f.write(response.content)

            # Verify the file has content before reporting success
            written_bytes = os.path.getsize(filename)
            if written_bytes == 0:
                print(f"Error: Generated file is empty or does not exist: {filename}")
                return False

            print(f"Successfully generated speech file: {filename}")
            print(f"File size: {written_bytes} bytes")
            return True
        except Exception as e:
            print(f"Error generating speech: {str(e)}")
            logger.exception(f"Error generating speech: {str(e)}")
            return False

    def merge_audio_files(self, audio_files: List[str], output_file: str) -> bool:
        """Merge multiple audio files into one using ffmpeg."""
        try:
            # Ensure output directory exists
            output_dir = os.path.dirname(os.path.abspath(output_file))
            os.makedirs(output_dir, exist_ok=True)
            
            if not audio_files:
                print("No audio files to merge")
                return False

            # Verify all input files exist
            for audio_file in audio_files:
                if not os.path.exists(audio_file):
                    print(f"Audio file does not exist: {audio_file}")
                    return False

            # Ensure all paths are absolute
            output_file = os.path.abspath(output_file)
            output_dir = os.path.dirname(output_file)
            os.makedirs(output_dir, exist_ok=True)
            
            # Create temporary files in the same directory
            list_file = os.path.join(output_dir, "files.txt")
            silence_file = os.path.join(output_dir, "silence.mp3")
            
            print(f"Output directory: {output_dir}")
            print(f"List file: {list_file}")
            print(f"Silence file: {silence_file}")
            
            # Generate shorter silence file (0.3 seconds instead of 1 second)
            silence_result = subprocess.run([
                'ffmpeg', '-f', 'lavfi', '-i', 'anullsrc=r=44100:cl=mono', 
                '-t', '0.3', '-q:a', '9', '-acodec', 'libmp3lame', silence_file
            ], capture_output=True, text=True)

            if silence_result.returncode != 0:
                print(f"Error generating silence file: {silence_result.stderr}")
                return False

            if not os.path.exists(silence_file):
                print("Failed to create silence file")
                return False

            # IMPORTANT: The order here determines the final audio order
            print("\nGenerating files list in exact provided order:")
            try:
                with open(list_file, "w", encoding='utf-8') as f:
                    for i, audio_file in enumerate(audio_files):
                        abs_audio_path = os.path.abspath(audio_file)
                        print(f"{i+1}. Adding audio file: {os.path.basename(abs_audio_path)}")
                        # Use forward slashes for ffmpeg compatibility
                        abs_audio_path = abs_audio_path.replace('\\', '/')
                        silence_path = silence_file.replace('\\', '/')
                        f.write(f"file '{abs_audio_path}'\n")
                        # Add a shorter silence after each audio segment (except the last one)
                        if i < len(audio_files) - 1:
                            f.write(f"file '{silence_path}'\n")
            except Exception as e:
                print(f"Error writing list file: {str(e)}")
                return False

            if not os.path.exists(list_file):
                print("Failed to create list file")
                return False

            # Print the contents of the list file for debugging
            print("\nContents of files.txt:")
            with open(list_file, 'r', encoding='utf-8') as f:
                print(f.read())

            # Merge all files using the concat demuxer with optimized settings
            try:
                # Use concat demuxer with additional parameters for better playback
                result = subprocess.run(
                    ['ffmpeg', '-f', 'concat', '-safe', '0', '-i', list_file,
                     '-c:a', 'libmp3lame', '-q:a', '4', '-ar', '44100',
                     output_file],
                    capture_output=True,
                    text=True,
                    check=True
                )
            except subprocess.CalledProcessError as e:
                logger.error(f"FFmpeg command failed: {e.stderr}")
                return False
            
            # Verify the output file was created
            if not os.path.exists(output_file):
                print("Failed to create output file")
                return False

            print(f"Successfully created merged audio file: {output_file}")
            return True
        except Exception as e:
            print(f"Error merging audio files: {str(e)}")
            return False

    async def create_podcast(
        self,
        topic: str,
        research: str,
        conversation_blocks: List[Dict],
        believer_voice_id: str,
        skeptic_voice_id: str,
        user_id: str = None
    ) -> Dict:
        """Create a podcast: synthesize each block, merge the audio, store the record.

        Each conversation block is converted to speech with
        ``generate_speech``, the clips are joined with ``merge_audio_files``,
        the duration is read with ffprobe, and a document is inserted into
        the ``podcasts`` collection. The blocking HTTP and ffmpeg work runs
        in the default executor so the event loop stays responsive.

        Args:
            topic: Podcast topic, stored and returned as given.
            research: Research text stored with the podcast.
            conversation_blocks: Blocks in one of the layouts described on
                ``_plan_segments``.
            believer_voice_id: Voice used for believer blocks.
            skeptic_voice_id: Voice used for every other block.
            user_id: Optional owner id stored with the podcast.

        Returns:
            On success a dict with ``podcast_id``, ``audio_path``, ``topic``
            and ``duration``. On any failure the working directory is removed
            and ``{"error": <message>}`` is returned (this method never
            raises).
        """
        # Local imports keep this method self-contained.
        import asyncio
        import uuid

        podcast_temp_dir = None
        try:
            loop = asyncio.get_running_loop()

            # Debug logging for voice IDs
            print("\nPodcast Creation - Voice Configuration:")
            print(f"Believer Voice ID: {believer_voice_id}")
            print(f"Skeptic Voice ID: {skeptic_voice_id}")

            # Create a unique directory with absolute path. The random suffix
            # keeps two requests arriving in the same second from sharing
            # (and later deleting) each other's files.
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            run_name = f"{timestamp}_{uuid.uuid4().hex[:8]}"
            podcast_temp_dir = os.path.abspath(os.path.join(self.temp_dir, run_name))
            os.makedirs(podcast_temp_dir, exist_ok=True)

            print(f"Created temp directory: {podcast_temp_dir}")
            print(f"Processing conversation blocks: {json.dumps(conversation_blocks, indent=2)}")

            segments = self._plan_segments(
                conversation_blocks,
                believer_voice_id,
                skeptic_voice_id,
                podcast_temp_dir
            )

            # Generate clips one at a time so the first failure aborts the
            # run and the resulting list keeps the planned order.
            audio_files = []
            for text, voice_id, audio_file, label in segments:
                print(f"\nProcessing {label} with voice {voice_id}")
                print(f"Content preview: {text[:100]}...")

                generated = await loop.run_in_executor(
                    None, self.generate_speech, text, voice_id, audio_file
                )
                if not generated:
                    raise Exception(f"Failed to generate audio for {label}")

                audio_files.append(audio_file)
                print(f"Generated audio for {label}, added to position {len(audio_files)}")

            if not audio_files:
                raise Exception("No audio files were generated from the conversation blocks")

            print(f"\nGenerated {len(audio_files)} audio files in total")

            # Print the final order of audio files for verification
            print("\nFinal audio file order before merging:")
            for i, file in enumerate(audio_files):
                print(f"{i+1}. {os.path.basename(file)}")

            # Merge all audio files
            final_audio = os.path.join(podcast_temp_dir, "final_podcast.mp3")
            print(f"Merging to final audio: {final_audio}")

            merged = await loop.run_in_executor(
                None, self.merge_audio_files, audio_files, final_audio
            )
            if not merged:
                raise Exception("Failed to merge audio files")

            # A failed duration probe yields 0 and does not fail the podcast
            duration = await loop.run_in_executor(None, self._probe_duration, final_audio)

            podcast_doc = {
                "topic": topic,
                "research": research,
                "conversation_blocks": conversation_blocks,
                "audio_path": final_audio,
                "created_at": datetime.utcnow(),
                "believer_voice_id": believer_voice_id,
                "skeptic_voice_id": skeptic_voice_id,
                "user_id": user_id,
                "duration": duration  # Add duration to MongoDB document
            }

            result = await podcasts.insert_one(podcast_doc)

            # Clean up individual audio files but keep the final one
            for audio_file in audio_files:
                if os.path.exists(audio_file):
                    os.remove(audio_file)

            return {
                "podcast_id": str(result.inserted_id),
                "audio_path": final_audio,
                "topic": topic,
                "duration": duration  # Return duration in the result
            }

        except Exception as e:
            # Clean up the temp directory in case of error. The directory may
            # not have been assigned yet, and a cleanup problem must not mask
            # the original failure.
            if podcast_temp_dir and os.path.exists(podcast_temp_dir):
                shutil.rmtree(podcast_temp_dir, ignore_errors=True)
            logger.exception(f"Error in podcast creation: {str(e)}")
            return {
                "error": str(e)
            }

    def _plan_segments(
        self,
        conversation_blocks: List[Dict],
        believer_voice_id: str,
        skeptic_voice_id: str,
        podcast_temp_dir: str
    ) -> List[tuple]:
        """Translate conversation blocks into ordered TTS jobs without doing any I/O.

        Three block layouts are recognised, checked in this order:

        1. Turn-based blocks carrying "type", "turn" and "content"; processed
           in the order given.
        2. Blocks with "input" where at least one block also has "turn" and
           "type"; processed sorted by "turn" (blocks without one go last).
        3. Blocks with "input" only; processed in the order given, with the
           speaker derived from the block's "name".

        Blocks whose text is missing or blank are skipped.

        Returns:
            A list of ``(text, voice_id, audio_path, label)`` tuples in
            playback order; ``label`` is the description used in progress
            and error messages.

        Raises:
            Exception: if no block matches any recognised layout.
        """
        blocks = conversation_blocks or []

        def named_believer(block: Dict) -> bool:
            # Check for either "Believer" in name or a name starting with "alloy"
            name = block.get("name", "")
            return "Believer" in name or name.lower().startswith("alloy")

        def clip_path(stem: str) -> str:
            return os.path.join(podcast_temp_dir, f"{stem}.mp3")

        segments = []

        # First check: New format blocks with type and turn
        if any("type" in b and "turn" in b and "content" in b for b in blocks):
            print("\nProcessing new format blocks with type, turn, and content fields")

            # Keep the EXACT order provided so speakers alternate as the
            # caller specified.
            for idx, block in enumerate(blocks):
                if not ("type" in block and "content" in block and "turn" in block):
                    continue

                content = block["content"] or ""
                if not content.strip():  # Skip empty content
                    continue

                turn = block["turn"]
                agent_type = block["type"]
                is_believer = agent_type == "believer"
                voice_id = believer_voice_id if is_believer else skeptic_voice_id
                file_prefix = "believer" if is_believer else "skeptic"

                segments.append((
                    content,
                    voice_id,
                    clip_path(f"{file_prefix}_turn_{turn}_{idx}"),
                    f"{agent_type} turn {turn}"
                ))
            return segments

        # Second check: Blocks with input field and possibly turn information
        if any("input" in b for b in blocks):
            print("\nProcessing blocks with input field")

            has_turn_info = any("turn" in b and "type" in b for b in blocks)
            if has_turn_info:
                print("Blocks have both input field and turn-based structure - using mixed format")
                # Sort by turn if available, ensuring proper sequence
                ordered = sorted(blocks, key=lambda b: b.get("turn", float('inf')))
            else:
                # Old format - process blocks sequentially as they appear
                print("Processing old format blocks sequentially")
                ordered = blocks

            for idx, block in enumerate(ordered):
                text = block.get("input") or ""
                if not text.strip():
                    continue

                # The type field only decides the speaker in the mixed format
                if has_turn_info and "type" in block:
                    is_believer = block["type"] == "believer"
                else:
                    is_believer = named_believer(block)

                voice_id = believer_voice_id if is_believer else skeptic_voice_id
                speaker_type = "believer" if is_believer else "skeptic"

                if has_turn_info:
                    turn = block.get("turn", idx + 1)
                    stem = f"{speaker_type}_turn_{turn}_{idx}"
                    label = f"{speaker_type} turn {turn}"
                else:
                    print(f"Block name: {block.get('name', '')}")  # Debug logging
                    stem = f"part_{idx+1}"
                    label = f"part {idx+1}"

                segments.append((text, voice_id, clip_path(stem), label))
            return segments

        raise Exception("Invalid conversation blocks format - no recognizable structure found")

    def _probe_duration(self, audio_path: str) -> float:
        """Return the duration of ``audio_path`` in seconds via ffprobe, or 0 on failure."""
        try:
            cmd = [
                'ffprobe',
                '-v', 'error',
                '-show_entries', 'format=duration',
                '-of', 'default=noprint_wrappers=1:nokey=1',
                audio_path
            ]
            probe = subprocess.run(cmd, capture_output=True, text=True)
            if probe.returncode == 0:
                seconds = float(probe.stdout.strip())
                print(f"Audio duration: {seconds} seconds")
                return seconds
            print(f"Failed to get audio duration: {probe.stderr}")
        except Exception as e:
            # Don't fail the entire process for duration calculation
            print(f"Error calculating duration: {str(e)}")
        return 0

    async def get_podcast(self, podcast_id: str) -> Dict:
        """Retrieve a podcast by ID."""
        try:
            from bson.objectid import ObjectId
            podcast = await podcasts.find_one({"_id": ObjectId(podcast_id)})
            if podcast:
                podcast["_id"] = str(podcast["_id"])
                return podcast
            return {"error": "Podcast not found"}
        except Exception as e:
            return {"error": str(e)}