saq1b committed on
Commit
77ae41c
·
verified ·
1 Parent(s): 16d9e70

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +73 -125
app.py CHANGED
@@ -1,8 +1,6 @@
1
  import gradio as gr
2
- import logging
3
  from pydub import AudioSegment
4
  from google import genai # Using the new Gemini API client
5
- from google.genai import types # For inline file parts
6
  import json
7
  import uuid
8
  import io
@@ -12,12 +10,6 @@ import os
12
  import time
13
  import aiofiles
14
 
15
- # Set up logging
16
- logging.basicConfig(level=logging.INFO)
17
-
18
- # Maximum file size allowed: 20 MB
19
- MAX_FILE_SIZE = 20 * 1024 * 1024
20
-
21
  class PodcastGenerator:
22
  def __init__(self):
23
  pass
@@ -58,16 +50,18 @@ Follow this example structure:
58
  """
59
  user_prompt = f"Please generate a podcast script based on the following user input:\n{prompt}"
60
 
61
- # Initialize the Gemini API client with the provided API key.
62
  client = genai.Client(api_key=api_key)
63
  contents = []
64
  if file_data is not None:
65
  try:
66
- # Use inline file data directly without uploading.
67
- contents.append(types.Part.from_bytes(data=file_data, mime_type=file_mime_type))
 
 
68
  except Exception as e:
69
- logging.error("Error preparing file part: %s", e)
70
- raise gr.Error(f"Error processing file data: {e}")
71
  contents.append(user_prompt)
72
 
73
  config = {
@@ -84,7 +78,6 @@ Follow this example structure:
84
  config=config
85
  )
86
  except Exception as e:
87
- logging.error("API call failed: %s", e)
88
  if "API key not valid" in str(e):
89
  raise gr.Error("Invalid API key. Please provide a valid Gemini API key.")
90
  elif "rate limit" in str(e).lower():
@@ -92,134 +85,90 @@ Follow this example structure:
92
  else:
93
  raise gr.Error(f"Failed to generate podcast script: {e}")
94
 
95
- try:
96
- result = json.loads(response.text)
97
- except json.JSONDecodeError as e:
98
- logging.error("JSON parsing failed: %s", e)
99
- raise gr.Error(f"Response is not valid JSON: {e}")
100
-
101
- logging.info("Successfully generated script: %s", result)
102
- return result
103
 
104
  async def tts_generate(self, text: str, speaker: int, speaker1: str, speaker2: str) -> str:
105
  voice = speaker1 if speaker == 1 else speaker2
106
- try:
107
- speech = edge_tts.Communicate(text, voice)
108
- except Exception as e:
109
- logging.error("TTS initialization failed: %s", e)
110
- raise gr.Error(f"Text-to-Speech initialization error: {e}")
111
 
112
  temp_filename = f"temp_{uuid.uuid4()}.wav"
113
  try:
114
  await speech.save(temp_filename)
115
  return temp_filename
116
  except Exception as e:
117
- logging.error("TTS generation failed: %s", e)
118
  if os.path.exists(temp_filename):
119
  os.remove(temp_filename)
120
- raise gr.Error(f"Failed to generate speech for text: {e}")
121
 
122
  async def combine_audio_files(self, audio_files: list) -> str:
123
- try:
124
- combined_audio = AudioSegment.empty()
125
- for audio_file in audio_files:
126
- try:
127
- combined_audio += AudioSegment.from_file(audio_file)
128
- except Exception as inner_e:
129
- logging.error("Error processing audio file %s: %s", audio_file, inner_e)
130
- raise gr.Error(f"Error processing audio file: {inner_e}")
131
- finally:
132
- if os.path.exists(audio_file):
133
- os.remove(audio_file) # Clean up temporary file
134
- output_filename = f"output_{uuid.uuid4()}.wav"
135
- combined_audio.export(output_filename, format="wav")
136
- return output_filename
137
- except Exception as e:
138
- logging.error("Failed to combine audio files: %s", e)
139
- raise gr.Error(f"Failed to combine audio files: {e}")
140
 
141
- async def generate_podcast(self, input_text: str, language: str, speaker1: str, speaker2: str, api_key: str, file_data=None, file_mime_type=None) -> str:
142
- try:
143
- gr.Info("Generating podcast script...")
144
- start_time = time.time()
145
- podcast_json = await self.generate_script(input_text, language, api_key, file_data, file_mime_type)
146
- end_time = time.time()
147
- gr.Info(f"Successfully generated podcast script in {(end_time - start_time):.2f} seconds!")
148
- except Exception as e:
149
- logging.error("Script generation error: %s", e)
150
- raise gr.Error(f"Error generating podcast script: {e}")
151
-
152
- try:
153
- gr.Info("Generating podcast audio files...")
154
- start_time = time.time()
155
- audio_files = await asyncio.gather(*[
156
- self.tts_generate(item['line'], item['speaker'], speaker1, speaker2)
157
- for item in podcast_json.get('podcast', [])
158
- ])
159
- end_time = time.time()
160
- gr.Info(f"Successfully generated podcast audio files in {(end_time - start_time):.2f} seconds!")
161
- except Exception as e:
162
- logging.error("TTS generation error: %s", e)
163
- raise gr.Error(f"Error generating audio files: {e}")
164
 
165
- try:
166
- combined_audio = await self.combine_audio_files(audio_files)
167
- return combined_audio
168
- except Exception as e:
169
- logging.error("Audio combining error: %s", e)
170
- raise gr.Error(f"Error combining audio files: {e}")
171
-
172
- async def process_input(input_text: str, input_file, language: str, speaker1: str, speaker2: str, api_key: str = "") -> str:
173
- try:
174
- gr.Info("Starting podcast generation...")
175
  start_time = time.time()
 
 
 
176
 
177
- voice_names = {
178
- "Andrew - English (United States)": "en-US-AndrewMultilingualNeural",
179
- "Ava - English (United States)": "en-US-AvaMultilingualNeural",
180
- "Brian - English (United States)": "en-US-BrianMultilingualNeural",
181
- "Emma - English (United States)": "en-US-EmmaMultilingualNeural",
182
- "Florian - German (Germany)": "de-DE-FlorianMultilingualNeural",
183
- "Seraphina - German (Germany)": "de-DE-SeraphinaMultilingualNeural",
184
- "Remy - French (France)": "fr-FR-RemyMultilingualNeural",
185
- "Vivienne - French (France)": "fr-FR-VivienneMultilingualNeural"
186
- }
187
-
188
- speaker1 = voice_names.get(speaker1, speaker1)
189
- speaker2 = voice_names.get(speaker2, speaker2)
190
-
191
- file_data = None
192
- file_mime_type = None
193
- if input_file:
194
- ext = os.path.splitext(input_file.name)[1].lower()
195
- if ext not in ['.pdf', '.txt']:
196
- raise gr.Error("Unsupported file type. Only PDF and TXT files are allowed.")
197
- try:
198
- async with aiofiles.open(input_file.name, 'rb') as f:
199
- file_data = await f.read()
200
- except Exception as e:
201
- logging.error("Error reading file: %s", e)
202
- raise gr.Error(f"Error reading file: {e}")
203
- if len(file_data) > MAX_FILE_SIZE:
204
- raise gr.Error("File size exceeds 20MB limit.")
205
- file_mime_type = 'application/pdf' if ext == '.pdf' else 'text/plain'
206
-
207
- if not api_key:
208
- api_key = os.getenv("GENAI_API_KEY")
209
- if not api_key:
210
- raise gr.Error("No API key provided and none found in the environment.")
211
-
212
- podcast_generator = PodcastGenerator()
213
- podcast = await podcast_generator.generate_podcast(input_text, language, speaker1, speaker2, api_key, file_data, file_mime_type)
214
-
215
  end_time = time.time()
216
- gr.Info(f"Successfully generated podcast in {(end_time - start_time):.2f} seconds!")
217
- return podcast
218
- except Exception as e:
219
- logging.error("Process input error: %s", e)
220
- raise gr.Error(f"Error in processing input: {e}")
221
 
222
- # Disable API generation to avoid schema-related errors.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
223
  iface = gr.Interface(
224
  fn=process_input,
225
  inputs=[
@@ -280,8 +229,7 @@ iface = gr.Interface(
280
  ],
281
  title="PodcastGen 🎙️",
282
  description="Generate a 2-speaker podcast from text input or documents!",
283
- allow_flagging="never",
284
- allow_api=False # Disables API endpoints to avoid schema errors
285
  )
286
 
287
  if __name__ == "__main__":
 
1
  import gradio as gr
 
2
  from pydub import AudioSegment
3
  from google import genai # Using the new Gemini API client
 
4
  import json
5
  import uuid
6
  import io
 
10
  import time
11
  import aiofiles
12
 
 
 
 
 
 
 
13
  class PodcastGenerator:
14
  def __init__(self):
15
  pass
 
50
  """
51
  user_prompt = f"Please generate a podcast script based on the following user input:\n{prompt}"
52
 
53
+ # Initialize the client (it will pick up the provided API key)
54
  client = genai.Client(api_key=api_key)
55
  contents = []
56
  if file_data is not None:
57
  try:
58
+ uploaded_file = await client.aio.files.upload(
59
+ path=io.BytesIO(file_data),
60
+ config={"mime_type": file_mime_type}
61
+ )
62
  except Exception as e:
63
+ raise gr.Error(f"File upload failed: {e}")
64
+ contents.append(uploaded_file)
65
  contents.append(user_prompt)
66
 
67
  config = {
 
78
  config=config
79
  )
80
  except Exception as e:
 
81
  if "API key not valid" in str(e):
82
  raise gr.Error("Invalid API key. Please provide a valid Gemini API key.")
83
  elif "rate limit" in str(e).lower():
 
85
  else:
86
  raise gr.Error(f"Failed to generate podcast script: {e}")
87
 
88
+ print(f"Generated podcast script:\n{response.text}")
89
+ return json.loads(response.text)
 
 
 
 
 
 
90
 
91
  async def tts_generate(self, text: str, speaker: int, speaker1: str, speaker2: str) -> str:
92
  voice = speaker1 if speaker == 1 else speaker2
93
+ speech = edge_tts.Communicate(text, voice)
 
 
 
 
94
 
95
  temp_filename = f"temp_{uuid.uuid4()}.wav"
96
  try:
97
  await speech.save(temp_filename)
98
  return temp_filename
99
  except Exception as e:
 
100
  if os.path.exists(temp_filename):
101
  os.remove(temp_filename)
102
+ raise e
103
 
104
  async def combine_audio_files(self, audio_files: list) -> str:
105
+ combined_audio = AudioSegment.empty()
106
+ for audio_file in audio_files:
107
+ combined_audio += AudioSegment.from_file(audio_file)
108
+ os.remove(audio_file) # Clean up temporary files
 
 
 
 
 
 
 
 
 
 
 
 
 
109
 
110
+ output_filename = f"output_{uuid.uuid4()}.wav"
111
+ combined_audio.export(output_filename, format="wav")
112
+ return output_filename
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
 
114
async def generate_podcast(self, input_text: str, language: str, speaker1: str, speaker2: str, api_key: str, file_data=None, file_mime_type=None) -> str:
    """Generate a podcast: script first, then per-line audio, then one file.

    Args:
        input_text: User text forwarded to ``generate_script``.
        language: Language setting forwarded to ``generate_script``.
        speaker1: Voice name used for lines whose ``speaker`` is 1.
        speaker2: Voice name used for all other lines.
        api_key: API key forwarded to ``generate_script``.
        file_data: Optional raw bytes of an attached document.
        file_mime_type: MIME type describing ``file_data``.

    Returns:
        The path returned by ``combine_audio_files``.

    Raises:
        gr.Error: If the generated script has no ``'podcast'`` list.
        Exception: The first error raised by any text-to-speech task.
    """
    gr.Info("Generating podcast script...")
    start_time = time.time()
    podcast_json = await self.generate_script(input_text, language, api_key, file_data, file_mime_type)
    end_time = time.time()
    gr.Info(f"Successfully generated podcast script in {(end_time - start_time):.2f} seconds!")

    # Validate the shape up front so a malformed script yields a readable
    # error instead of a bare KeyError/TypeError.
    segments = podcast_json.get('podcast') if isinstance(podcast_json, dict) else None
    if not isinstance(segments, list):
        raise gr.Error("Generated script is missing the 'podcast' list of lines.")

    gr.Info("Generating podcast audio files...")
    start_time = time.time()
    # return_exceptions=True waits for every task to settle, so on failure we
    # know exactly which temp files exist and can remove them.
    results = await asyncio.gather(
        *[
            self.tts_generate(item['line'], item['speaker'], speaker1, speaker2)
            for item in segments
        ],
        return_exceptions=True,
    )
    failures = [outcome for outcome in results if isinstance(outcome, BaseException)]
    if failures:
        for outcome in results:
            if isinstance(outcome, str) and os.path.exists(outcome):
                os.remove(outcome)
        raise failures[0]
    audio_files = list(results)
    end_time = time.time()
    gr.Info(f"Successfully generated podcast audio files in {(end_time - start_time):.2f} seconds!")

    combined_audio = await self.combine_audio_files(audio_files)
    return combined_audio
 
132
 
133
async def process_input(input_text: str, input_file, language: str, speaker1: str, speaker2: str, api_key: str = "") -> str:
    """Gradio handler: validate the inputs and produce the podcast audio file.

    Args:
        input_text: Text the podcast should be based on.
        input_file: Optional uploaded file object exposing a ``.name`` path;
            only ``.pdf`` and ``.txt`` are accepted. Falsy means no file.
        language: Language setting forwarded to the generator.
        speaker1: Display label (or raw voice name) for the first speaker.
        speaker2: Display label (or raw voice name) for the second speaker.
        api_key: API key; when empty, ``GENAI_API_KEY`` is read from the
            environment instead.

    Returns:
        Path of the generated podcast audio file.

    Raises:
        gr.Error: For an unsupported file type or when no API key is available.
    """
    gr.Info("Starting podcast generation...")
    start_time = time.time()

    voice_names = {
        "Andrew - English (United States)": "en-US-AndrewMultilingualNeural",
        "Ava - English (United States)": "en-US-AvaMultilingualNeural",
        "Brian - English (United States)": "en-US-BrianMultilingualNeural",
        "Emma - English (United States)": "en-US-EmmaMultilingualNeural",
        "Florian - German (Germany)": "de-DE-FlorianMultilingualNeural",
        "Seraphina - German (Germany)": "de-DE-SeraphinaMultilingualNeural",
        "Remy - French (France)": "fr-FR-RemyMultilingualNeural",
        "Vivienne - French (France)": "fr-FR-VivienneMultilingualNeural",
    }

    # Fall back to the value itself so a label missing from the table (or an
    # already-resolved voice name) does not raise KeyError.
    speaker1 = voice_names.get(speaker1, speaker1)
    speaker2 = voice_names.get(speaker2, speaker2)

    file_data = None
    file_mime_type = None
    if input_file:
        ext = os.path.splitext(input_file.name)[1].lower()
        if ext not in ['.pdf', '.txt']:
            raise gr.Error("Unsupported file type. Only PDF and TXT files are allowed.")
        async with aiofiles.open(input_file.name, 'rb') as f:
            file_data = await f.read()
        file_mime_type = 'application/pdf' if ext == '.pdf' else 'text/plain'

    if not api_key:
        api_key = os.getenv("GENAI_API_KEY")
    if not api_key:
        # Fail early with a clear message instead of handing None to the client.
        raise gr.Error("No API key provided and none found in the environment.")

    podcast_generator = PodcastGenerator()
    podcast = await podcast_generator.generate_podcast(input_text, language, speaker1, speaker2, api_key, file_data, file_mime_type)

    end_time = time.time()
    gr.Info(f"Successfully generated podcast in {(end_time - start_time):.2f} seconds!")
    return podcast
170
+
171
+ # Define Gradio interface
172
  iface = gr.Interface(
173
  fn=process_input,
174
  inputs=[
 
229
  ],
230
  title="PodcastGen 🎙️",
231
  description="Generate a 2-speaker podcast from text input or documents!",
232
+ allow_flagging="never"
 
233
  )
234
 
235
  if __name__ == "__main__":