"""Gradio app that turns text or a PDF/TXT document into a two-speaker podcast.

Pipeline: Gemini writes a JSON script, edge-tts synthesizes one clip per
script line, and pydub concatenates the clips into a single WAV file.
"""
import asyncio
import io
import json
import os
import time
import uuid

import aiofiles
import edge_tts
import gradio as gr
from google import genai  # Using the new Gemini API client
from pydub import AudioSegment

# Maps the voice label shown in the UI to the edge-tts voice identifier.
# The dropdown choices are derived from this mapping so the two cannot drift.
VOICE_NAMES = {
    "Andrew - English (United States)": "en-US-AndrewMultilingualNeural",
    "Ava - English (United States)": "en-US-AvaMultilingualNeural",
    "Brian - English (United States)": "en-US-BrianMultilingualNeural",
    "Emma - English (United States)": "en-US-EmmaMultilingualNeural",
    "Florian - German (Germany)": "de-DE-FlorianMultilingualNeural",
    "Seraphina - German (Germany)": "de-DE-SeraphinaMultilingualNeural",
    "Remy - French (France)": "fr-FR-RemyMultilingualNeural",
    "Vivienne - French (France)": "fr-FR-VivienneMultilingualNeural",
}


def _remove_files(paths) -> None:
    """Delete every path in *paths*, ignoring files that are already gone."""
    for path in paths:
        try:
            os.remove(path)
        except FileNotFoundError:
            pass


class PodcastGenerator:
    """Generates a podcast script with Gemini and renders it to audio."""

    def __init__(self):
        pass

    async def generate_script(self, prompt: str, language: str, api_key: str,
                              file_data=None, file_mime_type=None) -> dict:
        """Ask Gemini for a two-speaker podcast script.

        Args:
            prompt: User-supplied text the podcast should be based on.
            language: Target language name, or "Auto Detect" to follow the
                language of the input.
            api_key: Gemini API key handed to ``genai.Client``.
            file_data: Optional raw bytes of a document to upload alongside
                the prompt.
            file_mime_type: MIME type sent with ``file_data``.

        Returns:
            The parsed script: a dict whose "podcast" entry is a non-empty
            list of script items.

        Raises:
            gr.Error: If the upload or generation call fails, or the model
                response is empty, not valid JSON, or lacks a "podcast" list.
        """
        example = """
{
    "topic": "AGI",
    "podcast": [
        {
            "speaker": 2,
            "line": "So, AGI, huh? Seems like everyone's talking about it these days."
        },
        {
            "speaker": 1,
            "line": "Yeah, it's definitely having a moment, isn't it?"
        }
        // ... (rest of the example)
    ]
}
        """

        if language == "Auto Detect":
            language_instruction = "- The podcast MUST be in the same language as the user input."
        else:
            language_instruction = f"- The podcast MUST be in {language} language"

        system_prompt = f"""
You are a professional podcast generator. Your task is to generate a professional podcast script based on the user input.
{language_instruction}
- The podcast should have 2 speakers.
- The podcast should be long.
- Do not use names for the speakers.
- The podcast should be interesting, lively, and engaging, and hook the listener from the start.
- The input text might be disorganized or unformatted, originating from sources like PDFs or text files. Ignore any formatting inconsistencies or irrelevant details; your task is to distill the essential points, identify key definitions, and highlight intriguing facts that would be suitable for discussion in a podcast.
- The script must be in JSON format.
Follow this example structure:
{example}
"""
        user_prompt = f"Please generate a podcast script based on the following user input:\n{prompt}"

        # Initialize the client (it will pick up the provided API key)
        client = genai.Client(api_key=api_key)

        contents = []
        if file_data is not None:
            try:
                # NOTE(review): some google-genai releases name this keyword
                # `file` rather than `path` - confirm against the pinned SDK.
                uploaded_file = await client.aio.files.upload(
                    path=io.BytesIO(file_data),
                    config={"mime_type": file_mime_type}
                )
            except Exception as e:
                raise gr.Error(f"File upload failed: {e}") from e
            contents.append(uploaded_file)
        contents.append(user_prompt)

        config = {
            "system_instruction": system_prompt,
            "temperature": 1,
            "max_output_tokens": 8192,
            "response_mime_type": "application/json",
        }

        try:
            response = await client.aio.models.generate_content(
                model="gemini-2.0-flash",
                contents=contents,
                config=config
            )
        except Exception as e:
            # Translate the most common failures into actionable UI messages.
            message = str(e)
            if "API key not valid" in message:
                raise gr.Error("Invalid API key. Please provide a valid Gemini API key.") from e
            elif "rate limit" in message.lower():
                raise gr.Error("Rate limit exceeded for the API key. Please try again later or provide your own Gemini API key.") from e
            else:
                raise gr.Error(f"Failed to generate podcast script: {e}") from e

        script_text = response.text
        print(f"Generated podcast script:\n{script_text}")

        # An empty body would otherwise surface as an opaque TypeError from
        # json.loads.
        if not script_text:
            raise gr.Error("Failed to generate podcast script: the model returned an empty response.")
        try:
            script = json.loads(script_text)
        except json.JSONDecodeError as e:
            raise gr.Error(f"Failed to generate podcast script: the model returned invalid JSON ({e})") from e

        # Fail here with a readable message instead of a KeyError further
        # down the pipeline.
        podcast_items = script.get("podcast") if isinstance(script, dict) else None
        if not isinstance(podcast_items, list) or not podcast_items:
            raise gr.Error("Failed to generate podcast script: the response does not contain a 'podcast' list.")
        return script

    async def tts_generate(self, text: str, speaker: int, speaker1: str, speaker2: str) -> str:
        """Synthesize one script line and return the temp file it was saved to.

        Args:
            text: The line to speak.
            speaker: Speaker number from the script; 1 selects ``speaker1``,
                anything else selects ``speaker2``.
            speaker1: edge-tts voice identifier for speaker 1.
            speaker2: edge-tts voice identifier for the other speaker.

        Returns:
            Path of the generated audio file in the working directory.

        Raises:
            Exception: Whatever ``edge_tts`` raises; a partially written
                file is removed first.
        """
        voice = speaker1 if speaker == 1 else speaker2
        speech = edge_tts.Communicate(text, voice)

        # NOTE(review): edge-tts presumably writes MP3 data despite the .wav
        # suffix; pydub is later left to detect the format - confirm.
        temp_filename = f"temp_{uuid.uuid4()}.wav"
        try:
            await speech.save(temp_filename)
            return temp_filename
        except Exception:
            if os.path.exists(temp_filename):
                os.remove(temp_filename)
            raise

    async def combine_audio_files(self, audio_files: list) -> str:
        """Concatenate the clips in order and export them as one WAV file.

        The input clips are deleted whether or not decoding succeeds.

        Args:
            audio_files: Paths of the clips, in playback order.

        Returns:
            Path of the combined WAV file in the working directory.
        """
        combined_audio = AudioSegment.empty()
        try:
            for audio_file in audio_files:
                combined_audio += AudioSegment.from_file(audio_file)
        finally:
            # Clean up temporary files, including those after a clip that
            # failed to decode.
            _remove_files(audio_files)

        output_filename = f"output_{uuid.uuid4()}.wav"
        combined_audio.export(output_filename, format="wav")
        return output_filename

    async def generate_podcast(self, input_text: str, language: str, speaker1: str, speaker2: str,
                               api_key: str, file_data=None, file_mime_type=None) -> str:
        """Run the full pipeline: script generation, TTS, and audio merge.

        Args:
            input_text: User-supplied text for the podcast.
            language: Target language name or "Auto Detect".
            speaker1: edge-tts voice identifier for speaker 1.
            speaker2: edge-tts voice identifier for speaker 2.
            api_key: Gemini API key.
            file_data: Optional raw bytes of an uploaded document.
            file_mime_type: MIME type of ``file_data``.

        Returns:
            Path of the combined podcast WAV file.

        Raises:
            gr.Error: Propagated from script generation.
            Exception: The first TTS failure, after the clips that were
                produced have been deleted.
        """
        gr.Info("Generating podcast script...")
        start_time = time.time()
        podcast_json = await self.generate_script(input_text, language, api_key, file_data, file_mime_type)
        end_time = time.time()
        gr.Info(f"Successfully generated podcast script in {(end_time - start_time):.2f} seconds!")

        gr.Info("Generating podcast audio files...")
        start_time = time.time()
        # return_exceptions=True lets every task finish, so a single failure
        # cannot strand the clips written by its siblings.
        results = await asyncio.gather(
            *[
                self.tts_generate(item['line'], item['speaker'], speaker1, speaker2)
                for item in podcast_json['podcast']
            ],
            return_exceptions=True,
        )
        failures = [result for result in results if isinstance(result, BaseException)]
        if failures:
            _remove_files(result for result in results if isinstance(result, str))
            raise failures[0]
        audio_files = list(results)
        end_time = time.time()
        gr.Info(f"Successfully generated podcast audio files in {(end_time - start_time):.2f} seconds!")

        combined_audio = await self.combine_audio_files(audio_files)
        return combined_audio


async def process_input(input_text: str, input_file, language: str, speaker1: str, speaker2: str,
                        api_key: str = "") -> str:
    """Gradio handler: validate the inputs and produce the podcast audio.

    Args:
        input_text: Text typed into the UI.
        input_file: Uploaded file object exposing ``.name``, or a falsy value
            when nothing was uploaded.
        language: Selected language name or "Auto Detect".
        speaker1: UI label of the voice for speaker 1 (key of VOICE_NAMES).
        speaker2: UI label of the voice for speaker 2 (key of VOICE_NAMES).
        api_key: Optional Gemini API key; when empty, the GENAI_API_KEY
            environment variable is used.

    Returns:
        Path of the generated podcast WAV file.

    Raises:
        gr.Error: If neither text nor a file is supplied, or the file is not
            a PDF or TXT.
    """
    gr.Info("Starting podcast generation...")
    start_time = time.time()

    # Reject a request with nothing to talk about before spending an API call.
    if not input_file and not (input_text or "").strip():
        raise gr.Error("Please enter some text or upload a PDF or TXT file.")

    speaker1 = VOICE_NAMES[speaker1]
    speaker2 = VOICE_NAMES[speaker2]

    file_data = None
    file_mime_type = None
    if input_file:
        ext = os.path.splitext(input_file.name)[1].lower()
        if ext not in ['.pdf', '.txt']:
            raise gr.Error("Unsupported file type. Only PDF and TXT files are allowed.")
        async with aiofiles.open(input_file.name, 'rb') as f:
            file_data = await f.read()
        file_mime_type = 'application/pdf' if ext == '.pdf' else 'text/plain'

    # A missing key is passed through as None on purpose: the client may
    # still resolve credentials on its own.
    if not api_key:
        api_key = os.getenv("GENAI_API_KEY")

    podcast_generator = PodcastGenerator()
    podcast = await podcast_generator.generate_podcast(
        input_text, language, speaker1, speaker2, api_key, file_data, file_mime_type
    )

    end_time = time.time()
    gr.Info(f"Successfully generated podcast in {(end_time - start_time):.2f} seconds!")
    return podcast


# Define Gradio interface
iface = gr.Interface(
    fn=process_input,
    inputs=[
        gr.Textbox(label="Input Text"),
        gr.File(label="Or Upload a PDF or TXT file"),
        gr.Dropdown(
            label="Language",
            choices=[
                "Auto Detect",
                "Afrikaans", "Albanian", "Amharic", "Arabic", "Armenian", "Azerbaijani",
                "Bahasa Indonesian", "Bangla", "Basque", "Bengali", "Bosnian", "Bulgarian",
                "Burmese", "Catalan", "Chinese Cantonese", "Chinese Mandarin",
                "Chinese Taiwanese", "Croatian", "Czech", "Danish", "Dutch", "English",
                "Estonian", "Filipino", "Finnish", "French", "Galician", "Georgian",
                "German", "Greek", "Hebrew", "Hindi", "Hungarian", "Icelandic", "Irish",
                "Italian", "Japanese", "Javanese", "Kannada", "Kazakh", "Khmer", "Korean",
                "Lao", "Latvian", "Lithuanian", "Macedonian", "Malay", "Malayalam",
                "Maltese", "Mongolian", "Nepali", "Norwegian Bokmål", "Pashto", "Persian",
                "Polish", "Portuguese", "Romanian", "Russian", "Serbian", "Sinhala",
                "Slovak", "Slovene", "Somali", "Spanish", "Sundanese", "Swahili",
                "Swedish", "Tamil", "Telugu", "Thai", "Turkish", "Ukrainian", "Urdu",
                "Uzbek", "Vietnamese", "Welsh", "Zulu"
            ],
            value="Auto Detect"
        ),
        gr.Dropdown(
            label="Speaker 1 Voice",
            choices=list(VOICE_NAMES),
            value="Andrew - English (United States)"
        ),
        gr.Dropdown(
            label="Speaker 2 Voice",
            choices=list(VOICE_NAMES),
            value="Ava - English (United States)"
        ),
        gr.Textbox(label="Your Gemini API Key (Optional) - In case you are getting rate limited"),
    ],
    outputs=[
        gr.Audio(label="Generated Podcast Audio")
    ],
    title="PodcastGen 🎙️",
    description="Generate a 2-speaker podcast from text input or documents!",
    allow_flagging="never"
)

if __name__ == "__main__":
    iface.launch()