Spaces:

saq1b
/

podcastgen

Running

App Files Files Community

saq1b commited on 15 days ago

Commit

43f9ba7

verified ·

1 Parent(s): c1a4c87

Update app.py

Browse files

Files changed (1) hide show

app.py +333 -198

app.py CHANGED Viewed

@@ -1,8 +1,7 @@
 import gradio as gr
-import logging
 from pydub import AudioSegment
-from google import genai  # Using the new Gemini API client
-from google.genai import types  # For inline file parts
 import json
 import uuid
 import io
@@ -10,19 +9,13 @@ import edge_tts
 import asyncio
 import os
 import time
-import aiofiles
-# Set up logging
-logging.basicConfig(level=logging.INFO)
-# Maximum file size allowed: 20 MB
-MAX_FILE_SIZE = 20 * 1024 * 1024
 class PodcastGenerator:
     def __init__(self):
         pass
-    async def generate_script(self, prompt: str, language: str, api_key: str, file_data=None, file_mime_type=None) -> dict:
         example = """
 {
     "topic": "AGI",
@@ -34,8 +27,183 @@ class PodcastGenerator:
         {
             "speaker": 1,
             "line": "Yeah, it's definitely having a moment, isn't it?"
         }
-        // ... (rest of the example)
     ]
 }
         """
@@ -52,231 +220,198 @@ You are a professional podcast generator. Your task is to generate a professiona
 - Do not use names for the speakers.
 - The podcast should be interesting, lively, and engaging, and hook the listener from the start.
 - The input text might be disorganized or unformatted, originating from sources like PDFs or text files. Ignore any formatting inconsistencies or irrelevant details; your task is to distill the essential points, identify key definitions, and highlight intriguing facts that would be suitable for discussion in a podcast.
-- The script must be in JSON format.
 Follow this example structure:
 {example}
-"""
         user_prompt = f"Please generate a podcast script based on the following user input:\n{prompt}"
-        # Initialize the Gemini API client with the provided API key.
         client = genai.Client(api_key=api_key)
-        contents = []
-        if file_data is not None:
-            try:
-                # Use inline file data directly without uploading.
-                contents.append(types.Part.from_bytes(data=file_data, mime_type=file_mime_type))
-            except Exception as e:
-                logging.error("Error preparing file part: %s", e)
-                raise gr.Error(f"Error processing file data: {e}")
-        contents.append(user_prompt)
-        config = {
-            "system_instruction": system_prompt,
-            "temperature": 1,
-            "max_output_tokens": 8192,
-            "response_mime_type": "application/json",
-        }
-        try:
-            response = await client.aio.models.generate_content(
-                model="gemini-2.0-flash",
-                contents=contents,
-                config=config
-            )
-        except Exception as e:
-            logging.error("API call failed: %s", e)
-            if "API key not valid" in str(e):
-                raise gr.Error("Invalid API key. Please provide a valid Gemini API key.")
-            elif "rate limit" in str(e).lower():
-                raise gr.Error("Rate limit exceeded for the API key. Please try again later or provide your own Gemini API key.")
-            else:
-                raise gr.Error(f"Failed to generate podcast script: {e}")
-        try:
-            result = json.loads(response.text)
-        except json.JSONDecodeError as e:
-            logging.error("JSON parsing failed: %s", e)
-            raise gr.Error(f"Response is not valid JSON: {e}")
-        logging.info("Successfully generated script: %s", result)
-        return result
     async def tts_generate(self, text: str, speaker: int, speaker1: str, speaker2: str) -> str:
         voice = speaker1 if speaker == 1 else speaker2
-        try:
-            speech = edge_tts.Communicate(text, voice)
-        except Exception as e:
-            logging.error("TTS initialization failed: %s", e)
-            raise gr.Error(f"Text-to-Speech initialization error: {e}")
         temp_filename = f"temp_{uuid.uuid4()}.wav"
         try:
             await speech.save(temp_filename)
             return temp_filename
         except Exception as e:
-            logging.error("TTS generation failed: %s", e)
             if os.path.exists(temp_filename):
                 os.remove(temp_filename)
-            raise gr.Error(f"Failed to generate speech for text: {e}")
     async def combine_audio_files(self, audio_files: list) -> str:
-        try:
-            combined_audio = AudioSegment.empty()
-            for audio_file in audio_files:
-                try:
-                    combined_audio += AudioSegment.from_file(audio_file)
-                except Exception as inner_e:
-                    logging.error("Error processing audio file %s: %s", audio_file, inner_e)
-                    raise gr.Error(f"Error processing audio file: {inner_e}")
-                finally:
-                    if os.path.exists(audio_file):
-                        os.remove(audio_file)  # Clean up temporary file
-            output_filename = f"output_{uuid.uuid4()}.wav"
-            combined_audio.export(output_filename, format="wav")
-            return output_filename
-        except Exception as e:
-            logging.error("Failed to combine audio files: %s", e)
-            raise gr.Error(f"Failed to combine audio files: {e}")
-    async def generate_podcast(self, input_text: str, language: str, speaker1: str, speaker2: str, api_key: str, file_data=None, file_mime_type=None) -> str:
-        try:
-            gr.Info("Generating podcast script...")
-            start_time = time.time()
-            podcast_json = await self.generate_script(input_text, language, api_key, file_data, file_mime_type)
-            end_time = time.time()
-            gr.Info(f"Successfully generated podcast script in {(end_time - start_time):.2f} seconds!")
-        except Exception as e:
-            logging.error("Script generation error: %s", e)
-            raise gr.Error(f"Error generating podcast script: {e}")
-        try:
-            gr.Info("Generating podcast audio files...")
-            start_time = time.time()
-            audio_files = await asyncio.gather(*[
-                self.tts_generate(item['line'], item['speaker'], speaker1, speaker2)
-                for item in podcast_json.get('podcast', [])
-            ])
-            end_time = time.time()
-            gr.Info(f"Successfully generated podcast audio files in {(end_time - start_time):.2f} seconds!")
-        except Exception as e:
-            logging.error("TTS generation error: %s", e)
-            raise gr.Error(f"Error generating audio files: {e}")
-        try:
-            combined_audio = await self.combine_audio_files(audio_files)
-            return combined_audio
-        except Exception as e:
-            logging.error("Audio combining error: %s", e)
-            raise gr.Error(f"Error combining audio files: {e}")
 async def process_input(input_text: str, input_file, language: str, speaker1: str, speaker2: str, api_key: str = "") -> str:
-    try:
-        gr.Info("Starting podcast generation...")
-        start_time = time.time()
-        voice_names = {
-            "Andrew - English (United States)": "en-US-AndrewMultilingualNeural",
-            "Ava - English (United States)": "en-US-AvaMultilingualNeural",
-            "Brian - English (United States)": "en-US-BrianMultilingualNeural",
-            "Emma - English (United States)": "en-US-EmmaMultilingualNeural",
-            "Florian - German (Germany)": "de-DE-FlorianMultilingualNeural",
-            "Seraphina - German (Germany)": "de-DE-SeraphinaMultilingualNeural",
-            "Remy - French (France)": "fr-FR-RemyMultilingualNeural",
-            "Vivienne - French (France)": "fr-FR-VivienneMultilingualNeural"
-        }
-        speaker1 = voice_names.get(speaker1, speaker1)
-        speaker2 = voice_names.get(speaker2, speaker2)
-        file_data = None
-        file_mime_type = None
-        if input_file:
-            ext = os.path.splitext(input_file.name)[1].lower()
-            if ext not in ['.pdf', '.txt']:
-                raise gr.Error("Unsupported file type. Only PDF and TXT files are allowed.")
-            try:
-                async with aiofiles.open(input_file.name, 'rb') as f:
-                    file_data = await f.read()
-            except Exception as e:
-                logging.error("Error reading file: %s", e)
-                raise gr.Error(f"Error reading file: {e}")
-            if len(file_data) > MAX_FILE_SIZE:
-                raise gr.Error("File size exceeds 20MB limit.")
-            file_mime_type = 'application/pdf' if ext == '.pdf' else 'text/plain'
-        if not api_key:
-            api_key = os.getenv("GENAI_API_KEY")
-            if not api_key:
-                raise gr.Error("No API key provided and none found in the environment.")
         podcast_generator = PodcastGenerator()
-        podcast = await podcast_generator.generate_podcast(input_text, language, speaker1, speaker2, api_key, file_data, file_mime_type)
-        end_time = time.time()
-        gr.Info(f"Successfully generated podcast in {(end_time - start_time):.2f} seconds!")
-        return podcast
-    except Exception as e:
-        logging.error("Process input error: %s", e)
-        raise gr.Error(f"Error in processing input: {e}")
-# Note the updated output: forcing type="filepath" to avoid the schema bug.
 iface = gr.Interface(
     fn=process_input,
     inputs=[
         gr.Textbox(label="Input Text"),
         gr.File(label="Or Upload a PDF or TXT file"),
-        gr.Dropdown(
-            label="Language",
-            choices=[
-                "Auto Detect",
-                "Afrikaans", "Albanian", "Amharic", "Arabic", "Armenian", "Azerbaijani",
-                "Bahasa Indonesian", "Bangla", "Basque", "Bengali", "Bosnian", "Bulgarian",
-                "Burmese", "Catalan", "Chinese Cantonese", "Chinese Mandarin",
-                "Chinese Taiwanese", "Croatian", "Czech", "Danish", "Dutch", "English",
-                "Estonian", "Filipino", "Finnish", "French", "Galician", "Georgian",
-                "German", "Greek", "Hebrew", "Hindi", "Hungarian", "Icelandic", "Irish",
-                "Italian", "Japanese", "Javanese", "Kannada", "Kazakh", "Khmer", "Korean",
-                "Lao", "Latvian", "Lithuanian", "Macedonian", "Malay", "Malayalam",
-                "Maltese", "Mongolian", "Nepali", "Norwegian Bokmål", "Pashto", "Persian",
-                "Polish", "Portuguese", "Romanian", "Russian", "Serbian", "Sinhala",
-                "Slovak", "Slovene", "Somali", "Spanish", "Sundanese", "Swahili",
-                "Swedish", "Tamil", "Telugu", "Thai", "Turkish", "Ukrainian", "Urdu",
-                "Uzbek", "Vietnamese", "Welsh", "Zulu"
-            ],
-            value="Auto Detect"
-        ),
-        gr.Dropdown(
-            label="Speaker 1 Voice",
-            choices=[
-                "Andrew - English (United States)",
-                "Ava - English (United States)",
-                "Brian - English (United States)",
-                "Emma - English (United States)",
-                "Florian - German (Germany)",
-                "Seraphina - German (Germany)",
-                "Remy - French (France)",
-                "Vivienne - French (France)"
-            ],
-            value="Andrew - English (United States)"
-        ),
-        gr.Dropdown(
-            label="Speaker 2 Voice",
-            choices=[
-                "Andrew - English (United States)",
-                "Ava - English (United States)",
-                "Brian - English (United States)",
-                "Emma - English (United States)",
-                "Florian - German (Germany)",
-                "Seraphina - German (Germany)",
-                "Remy - French (France)",
-                "Vivienne - French (France)"
-            ],
-            value="Ava - English (United States)"
-        ),
         gr.Textbox(label="Your Gemini API Key (Optional) - In case you are getting rate limited"),
     ],
     outputs=[
-        gr.Audio(label="Generated Podcast Audio", type="filepath")
     ],
     title="PodcastGen 🎙️",
     description="Generate a 2-speaker podcast from text input or documents!",

 import gradio as gr
 from pydub import AudioSegment
+from google import genai
+from google.genai import types
 import json
 import uuid
 import io
 import asyncio
 import os
 import time
+import pathlib
 class PodcastGenerator:
     def __init__(self):
         pass
+    async def generate_script(self, prompt: str, language: str, api_key: str) -> dict:
         example = """
 {
     "topic": "AGI",
         {
             "speaker": 1,
             "line": "Yeah, it's definitely having a moment, isn't it?"
+        },
+        {
+            "speaker": 2,
+            "line": "It is and for good reason, right? I mean, you've been digging into this stuff, listening to the podcasts and everything. What really stood out to you? What got you hooked?"
+        },
+        {
+            "speaker": 1,
+            "line": "Honestly, it's the sheer scale of what AGI could do. We're talking about potentially reshaping well everything."
+        },
+        {
+            "speaker": 2,
+            "line": "No kidding, but let's be real. Sometimes it feels like every other headline is either hyping AGI up as this technological utopia or painting it as our inevitable robot overlords."
+        },
+        {
+            "speaker": 1,
+            "line": "It's easy to get lost in the noise, for sure."
+        },
+        {
+            "speaker": 2,
+            "line": "Exactly. So how about we try to cut through some of that, shall we?"
+        },
+        {
+            "speaker": 1,
+            "line": "Sounds like a plan."
+        },
+        {
+            "speaker": 2,
+            "line": "Okay, so first things first, AGI, what is it really? And I don't just mean some dictionary definition, we're talking about something way bigger than just a super smart computer, right?"
+        },
+        {
+            "speaker": 1,
+            "line": "Right, it's not just about more processing power or better algorithms, it's about a fundamental shift in how we think about intelligence itself."
+        },
+        {
+            "speaker": 2,
+            "line": "So like, instead of programming a machine for a specific task, we're talking about creating something that can learn and adapt like we do."
+        },
+        {
+            "speaker": 1,
+            "line": "Exactly, think of it this way: Right now, we've got AI that can beat a grandmaster at chess but ask that same AI to, say, write a poem or compose a symphony. No chance."
+        },
+        {
+            "speaker": 2,
+            "line": "Okay, I see. So, AGI is about bridging that gap, creating something that can move between those different realms of knowledge seamlessly."
+        },
+        {
+            "speaker": 1,
+            "line": "Precisely. It's about replicating that uniquely human ability to learn something new and apply that knowledge in completely different contexts and that's a tall order, let me tell you."
+        },
+        {
+            "speaker": 2,
+            "line": "I bet. I mean, think about how much we still don't even understand about our own brains."
+        },
+        {
+            "speaker": 1,
+            "line": "That's exactly it. We're essentially trying to reverse-engineer something we don't fully comprehend."
+        },
+        {
+            "speaker": 2,
+            "line": "And how are researchers even approaching that? What are some of the big ideas out there?"
+        },
+        {
+            "speaker": 1,
+            "line": "Well, there are a few different schools of thought. One is this idea of neuromorphic computing where they're literally trying to build computer chips that mimic the structure and function of the human brain."
+        },
+        {
+            "speaker": 2,
+            "line": "Wow, so like actually replicating the physical architecture of the brain. That's wild."
+        },
+        {
+            "speaker": 1,
+            "line": "It's pretty mind-blowing stuff and then you've got folks working on something called whole brain emulation."
+        },
+        {
+            "speaker": 2,
+            "line": "Okay, and what's that all about?"
+        },
+        {
+            "speaker": 1,
+            "line": "The basic idea there is to create a complete digital copy of a human brain down to the last neuron and synapse and run it on a sufficiently powerful computer simulation."
+        },
+        {
+            "speaker": 2,
+            "line": "Hold on, a digital copy of an entire brain, that sounds like something straight out of science fiction."
+        },
+        {
+            "speaker": 1,
+            "line": "It does, doesn't it? But it gives you an idea of the kind of ambition we're talking about here and the truth is we're still a long way off from truly achieving AGI, no matter which approach you look at."
+        },
+        {
+            "speaker": 2,
+            "line": "That makes sense but it's still exciting to think about the possibilities, even if they're a ways off."
+        },
+        {
+            "speaker": 1,
+            "line": "Absolutely and those possibilities are what really get people fired up about AGI, right? Yeah."
+        },
+        {
+            "speaker": 2,
+            "line": "For sure. In fact, I remember you mentioning something in that podcast about AGI's potential to revolutionize scientific research. Something about supercharging breakthroughs."
+        },
+        {
+            "speaker": 1,
+            "line": "Oh, absolutely. Imagine an AI that doesn't just crunch numbers but actually understands scientific data the way a human researcher does. We're talking about potential breakthroughs in everything from medicine and healthcare to material science and climate change."
+        },
+        {
+            "speaker": 2,
+            "line": "It's like giving scientists this incredibly powerful new tool to tackle some of the biggest challenges we face."
+        },
+        {
+            "speaker": 1,
+            "line": "Exactly, it could be a total game changer."
+        },
+        {
+            "speaker": 2,
+            "line": "Okay, but let's be real, every coin has two sides. What about the potential downsides of AGI? Because it can't all be sunshine and roses, right?"
+        },
+        {
+            "speaker": 1,
+            "line": "Right, there are definitely valid concerns. Probably the biggest one is the impact on the job market. As AGI gets more sophisticated, there's a real chance it could automate a lot of jobs that are currently done by humans."
+        },
+        {
+            "speaker": 2,
+            "line": "So we're not just talking about robots taking over factories but potentially things like, what, legal work, analysis, even creative fields?"
+        },
+        {
+            "speaker": 1,
+            "line": "Potentially, yes. And that raises a whole host of questions about what happens to those workers, how we retrain them, how we ensure that the benefits of AGI are shared equitably."
+        },
+        {
+            "speaker": 2,
+            "line": "Right, because it's not just about the technology itself, but how we choose to integrate it into society."
+        },
+        {
+            "speaker": 1,
+            "line": "Absolutely. We need to be having these conversations now about ethics, about regulation, about how to make sure AGI is developed and deployed responsibly."
+        },
+        {
+            "speaker": 2,
+            "line": "So it's less about preventing some kind of sci-fi robot apocalypse and more about making sure we're steering this technology in the right direction from the get-go."
+        },
+        {
+            "speaker": 1,
+            "line": "Exactly, AGI has the potential to be incredibly beneficial, but it's not going to magically solve all our problems. It's on us to make sure we're using it for good."
+        },
+        {
+            "speaker": 2,
+            "line": "It's like you said earlier, it's about shaping the future of intelligence."
+        },
+        {
+            "speaker": 1,
+            "line": "I like that. It really is."
+        },
+        {
+            "speaker": 2,
+            "line": "And honestly, that's a responsibility that extends beyond just the researchers and the policymakers."
+        },
+        {
+            "speaker": 1,
+            "line": "100%"
+        },
+        {
+            "speaker": 2,
+            "line": "So to everyone listening out there I'll leave you with this. As AGI continues to develop, what role do you want to play in shaping its future?"
+        },
+        {
+            "speaker": 1,
+            "line": "That's a question worth pondering."
+        },
+        {
+            "speaker": 2,
+            "line": "It certainly is and on that note, we'll wrap up this deep dive. Thanks for listening, everyone."
+        },
+        {
+            "speaker": 1,
+            "line": "Peace."
         }
     ]
 }
         """
 - Do not use names for the speakers.
 - The podcast should be interesting, lively, and engaging, and hook the listener from the start.
 - The input text might be disorganized or unformatted, originating from sources like PDFs or text files. Ignore any formatting inconsistencies or irrelevant details; your task is to distill the essential points, identify key definitions, and highlight intriguing facts that would be suitable for discussion in a podcast.
 Follow this example structure:
 {example}
+        """
         user_prompt = f"Please generate a podcast script based on the following user input:\n{prompt}"
         client = genai.Client(api_key=api_key)
+        config = types.GenerateContentConfig(
+            system_instruction=system_prompt,
+            temperature=1,
+            max_output_tokens=8192,
+            response_mime_type="application/json"
+        )
+        response = await client.aio.models.generate_content(
+            model="gemini-2.0-flash",
+            contents=[user_prompt],
+            config=config
+        )
+        print(f"Generated podcast script:\n{response.text}")
+        return json.loads(response.text)
     async def tts_generate(self, text: str, speaker: int, speaker1: str, speaker2: str) -> str:
         voice = speaker1 if speaker == 1 else speaker2
+        speech = edge_tts.Communicate(text, voice)
         temp_filename = f"temp_{uuid.uuid4()}.wav"
         try:
             await speech.save(temp_filename)
             return temp_filename
         except Exception as e:
             if os.path.exists(temp_filename):
                 os.remove(temp_filename)
+            raise e
     async def combine_audio_files(self, audio_files: list) -> str:
+        combined_audio = AudioSegment.empty()
+        for audio_file in audio_files:
+            combined_audio += AudioSegment.from_file(audio_file)
+            os.remove(audio_file)  # Clean up temporary files
+        output_filename = f"output_{uuid.uuid4()}.wav"
+        combined_audio.export(output_filename, format="wav")
+        return output_filename
+    async def generate_podcast(self, input_text: str, language: str, speaker1: str, speaker2: str, api_key: str) -> dict:
+        gr.Info("Generating podcast script...")
+        start_time = time.time()
+        podcast_json = await self.generate_script(input_text, language, api_key)
+        end_time = time.time()
+        gr.Info(f"Successfully generated podcast script in {(end_time - start_time):.2f} seconds!")
+        gr.Info("Generating podcast audio files...")
+        start_time = time.time()
+        audio_files = await asyncio.gather(*[self.tts_generate(item['line'], item['speaker'], speaker1, speaker2)
+                                               for item in podcast_json['podcast']])
+        end_time = time.time()
+        gr.Info(f"Successfully generated podcast audio files in {(end_time - start_time):.2f} seconds!")
+        combined_audio = await self.combine_audio_files(audio_files)
+        return combined_audio
+# Note: The manual file extraction class has been removed because we now use Gemini's File API.
 async def process_input(input_text: str, input_file, language: str, speaker1: str, speaker2: str, api_key: str = "") -> str:
+    gr.Info("Starting podcast generation...")
+    start_time = time.time()
+    voice_names = {
+        "Andrew - English (United States)": "en-US-AndrewMultilingualNeural",
+        "Ava - English (United States)": "en-US-AvaMultilingualNeural",
+        "Brian - English (United States)": "en-US-BrianMultilingualNeural",
+        "Emma - English (United States)": "en-US-EmmaMultilingualNeural",
+        "Florian - German (Germany)": "de-DE-FlorianMultilingualNeural",
+        "Seraphina - German (Germany)": "de-DE-SeraphinaMultilingualNeural",
+        "Remy - French (France)": "fr-FR-RemyMultilingualNeural",
+        "Vivienne - French (France)": "fr-FR-VivienneMultilingualNeural"
+    }
+    speaker1 = voice_names[speaker1]
+    speaker2 = voice_names[speaker2]
+    if not api_key:
+        api_key = os.getenv("GENAI_API_KEY")
+    client = genai.Client(api_key=api_key)
+    if input_file:
+        # Only allow PDF and TXT files.
+        _, file_extension = os.path.splitext(input_file.name)
+        if file_extension.lower() not in ['.pdf', '.txt']:
+            raise gr.Error(f"Unsupported file type: {file_extension}. Only PDF and TXT files are allowed.")
+        # Check file size is under 20GB.
+        max_size = 20 * 1024 * 1024 * 1024  # 20GB in bytes
+        if os.path.getsize(input_file.name) > max_size:
+            raise gr.Error("File size exceeds the 20GB limit.")
+        # Upload the file using the Gemini File API.
+        uploaded_file = client.files.upload(input_file.name)
+        if language == "Auto Detect":
+            language_instruction = "- The podcast MUST be in the same language as the document."
+        else:
+            language_instruction = f"- The podcast MUST be in {language} language"
+        system_prompt = f"""
+You are a professional podcast generator. Your task is to generate a professional podcast script based on the contents of the document.
+{language_instruction}
+- The podcast should have 2 speakers.
+- The podcast should be long.
+- Do not use names for the speakers.
+- The podcast should be interesting, lively, and engaging, and hook the listener from the start.
+- The input document might be disorganized or unformatted. Ignore any formatting inconsistencies or irrelevant details; distill the essential points, key definitions, and intriguing facts suitable for discussion.
+        """
+        user_prompt = "Please generate a podcast script based on the document."
+        config = types.GenerateContentConfig(
+            system_instruction=system_prompt,
+            temperature=1,
+            max_output_tokens=8192,
+            response_mime_type="application/json"
+        )
+        response = await client.aio.models.generate_content(
+            model="gemini-2.0-flash",
+            contents=[uploaded_file, user_prompt],
+            config=config
+        )
+        podcast_json = json.loads(response.text)
+    else:
+        # Use provided text input.
         podcast_generator = PodcastGenerator()
+        podcast_json = await podcast_generator.generate_script(input_text, language, api_key)
+    gr.Info("Generating podcast audio files...")
+    start_time_audio = time.time()
+    podcast_generator = PodcastGenerator()
+    audio_files = await asyncio.gather(*[podcast_generator.tts_generate(item['line'], item['speaker'], speaker1, speaker2)
+                                           for item in podcast_json['podcast']])
+    end_time_audio = time.time()
+    gr.Info(f"Successfully generated podcast audio files in {(end_time_audio - start_time_audio):.2f} seconds!")
+    combined_audio = await podcast_generator.combine_audio_files(audio_files)
+    end_time = time.time()
+    gr.Info(f"Successfully generated podcast in {(end_time - start_time):.2f} seconds!")
+    return combined_audio
 iface = gr.Interface(
     fn=process_input,
     inputs=[
         gr.Textbox(label="Input Text"),
         gr.File(label="Or Upload a PDF or TXT file"),
+        gr.Dropdown(label="Language", choices=[
+            "Auto Detect",
+            "Afrikaans", "Albanian", "Amharic", "Arabic", "Armenian", "Azerbaijani",
+            "Bahasa Indonesian", "Bangla", "Basque", "Bengali", "Bosnian", "Bulgarian",
+            "Burmese", "Catalan", "Chinese Cantonese", "Chinese Mandarin",
+            "Chinese Taiwanese", "Croatian", "Czech", "Danish", "Dutch", "English",
+            "Estonian", "Filipino", "Finnish", "French", "Galician", "Georgian",
+            "German", "Greek", "Hebrew", "Hindi", "Hungarian", "Icelandic", "Irish",
+            "Italian", "Japanese", "Javanese", "Kannada", "Kazakh", "Khmer", "Korean",
+            "Lao", "Latvian", "Lithuanian", "Macedonian", "Malay", "Malayalam",
+            "Maltese", "Mongolian", "Nepali", "Norwegian Bokmål", "Pashto", "Persian",
+            "Polish", "Portuguese", "Romanian", "Russian", "Serbian", "Sinhala",
+            "Slovak", "Slovene", "Somali", "Spanish", "Sundanese", "Swahili",
+            "Swedish", "Tamil", "Telugu", "Thai", "Turkish", "Ukrainian", "Urdu",
+            "Uzbek", "Vietnamese", "Welsh", "Zulu"
+        ], value="Auto Detect"),
+        gr.Dropdown(label="Speaker 1 Voice", choices=[
+            "Andrew - English (United States)",
+            "Ava - English (United States)",
+            "Brian - English (United States)",
+            "Emma - English (United States)",
+            "Florian - German (Germany)",
+            "Seraphina - German (Germany)",
+            "Remy - French (France)",
+            "Vivienne - French (France)"
+        ], value="Andrew - English (United States)"),
+        gr.Dropdown(label="Speaker 2 Voice", choices=[
+            "Andrew - English (United States)",
+            "Ava - English (United States)",
+            "Brian - English (United States)",
+            "Emma - English (United States)",
+            "Florian - German (Germany)",
+            "Seraphina - German (Germany)",
+            "Remy - French (France)",
+            "Vivienne - French (France)"
+        ], value="Ava - English (United States)"),
         gr.Textbox(label="Your Gemini API Key (Optional) - In case you are getting rate limited"),
     ],
     outputs=[
+        gr.Audio(label="Generated Podcast Audio")
     ],
     title="PodcastGen 🎙️",
     description="Generate a 2-speaker podcast from text input or documents!",