Cyleux committed on
Commit
3f0506d
·
verified ·
1 Parent(s): c12bbd2

Update spaces.py

Browse files
Files changed (1) hide show
  1. spaces.py +32 -20
spaces.py CHANGED
@@ -31,11 +31,8 @@ def get_voices():
31
  response = requests.get(url, headers=headers)
32
  if response.status_code == 200:
33
  voices = response.json()["voices"]
34
- voice_options = {voice["name"]: voice["voice_id"] for voice in voices}
35
- return voice_options
36
- else:
37
- print(f"Error getting voices: {response.status_code}")
38
- return {}
39
 
40
  def get_convo_list(description):
41
  prompt =f"Your task is to return a JSON object representing a complete conversation containing a key 'turns' with a value which is just a list of objects containing 'turn_number', an integer, and 'message', the message for that turn. Ensure you return as many turns as the user specifies, if they specify. Remember, each turn is a turn in a conversation between a phone agent (female) and a human (male). The phone agent should speak first. The conversation is described as:\n{description}.\nCritically, ensure that the human turns employ filler words (uh, uhhhh, ummmm, yeahhh, hm, hmm, etc with repeated letters to denote thinking...) and realistic language without using *sounds effects*. I repeat, do NOT use *sound effects*. Additionally, do not over-use filler words or start every human response with them. The goal is to sound realistic, not exagerrated. The AI should be conversational, employing transition phrases. The AI should always end their response with a question except when saying goodbye. Additionally, digits spaced out. For instance, the human might say: 'My phone number is 8 3 1... 5 4 8... 9 2 2 3...' instead of writing it out. They might also say 'My email is steve at gmail dot com.' where it is written out. Now provide the JSON."
@@ -173,10 +170,10 @@ def create_video_clip(image, duration, target_resolution=(1920, 1080)):
173
  return clip.set_duration(duration)
174
 
175
  def process_message(args):
176
- i, message, logo_image, female_voice_id, male_voice_id, male_stability, male_style = args
177
- voice_id = female_voice_id if i % 2 == 0 else male_voice_id
178
 
179
- if i % 2 == 0:
180
  text_color = "#cdfa8a"
181
  stability = 0.8
182
  style = 0
@@ -211,7 +208,7 @@ def process_message(args):
211
  print(f"Error processing message {i+1}: {e}")
212
  return (None, None, None)
213
 
214
- def generate_conversation_video(messages, female_voice_id, male_voice_id, logo_url, male_stability, male_style):
215
  logo_image = download_and_convert_svg_to_png(logo_url)
216
  if logo_image is None:
217
  return None
@@ -220,7 +217,7 @@ def generate_conversation_video(messages, female_voice_id, male_voice_id, logo_u
220
  audio_clips = []
221
  temp_audio_paths = []
222
 
223
- args = [(i, message, logo_image, female_voice_id, male_voice_id, male_stability, male_style) for i, message in enumerate(messages)]
224
  max_workers = 5
225
 
226
  with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
@@ -274,20 +271,31 @@ def generate_conversation_video(messages, female_voice_id, male_voice_id, logo_u
274
  return temp_video_path
275
 
276
  def generate_video(description, female_voice, male_voice, male_stability=0.65, male_style=0.35):
277
- voice_options = get_voices()
278
- female_voice_id = voice_options[female_voice]
279
- male_voice_id = voice_options[male_voice]
280
-
281
  logo_url = "https://opencall.ai/images/logo-symbol.svg"
282
 
283
  messages = get_convo_list(description)
284
- video_path = generate_conversation_video(messages, female_voice_id, male_voice_id, logo_url, male_stability, male_style)
285
 
286
  return video_path
287
 
288
- # Create Gradio interface
289
- voice_options = get_voices()
 
 
290
 
 
 
 
 
 
 
 
 
 
291
  iface = gr.Interface(
292
  fn=generate_video,
293
  inputs=[
@@ -298,13 +306,17 @@ iface = gr.Interface(
298
  info="You can be specific about the number of turns, tone, and content of the conversation"
299
  ),
300
  gr.Dropdown(
301
- choices=list(voice_options.keys()),
 
302
  label="Female Voice",
 
303
  info="Select the voice for the phone agent"
304
  ),
305
  gr.Dropdown(
306
- choices=list(voice_options.keys()),
307
- label="Male Voice",
 
 
308
  info="Select the voice for the customer"
309
  ),
310
  gr.Slider(
 
31
  response = requests.get(url, headers=headers)
32
  if response.status_code == 200:
33
  voices = response.json()["voices"]
34
+ return [(voice["name"], voice["voice_id"]) for voice in voices]
35
+ return []
 
 
 
36
 
37
  def get_convo_list(description):
38
  prompt =f"Your task is to return a JSON object representing a complete conversation containing a key 'turns' with a value which is just a list of objects containing 'turn_number', an integer, and 'message', the message for that turn. Ensure you return as many turns as the user specifies, if they specify. Remember, each turn is a turn in a conversation between a phone agent (female) and a human (male). The phone agent should speak first. The conversation is described as:\n{description}.\nCritically, ensure that the human turns employ filler words (uh, uhhhh, ummmm, yeahhh, hm, hmm, etc with repeated letters to denote thinking...) and realistic language without using *sounds effects*. I repeat, do NOT use *sound effects*. Additionally, do not over-use filler words or start every human response with them. The goal is to sound realistic, not exagerrated. The AI should be conversational, employing transition phrases. The AI should always end their response with a question except when saying goodbye. Additionally, digits spaced out. For instance, the human might say: 'My phone number is 8 3 1... 5 4 8... 9 2 2 3...' instead of writing it out. They might also say 'My email is steve at gmail dot com.' where it is written out. Now provide the JSON."
 
170
  return clip.set_duration(duration)
171
 
172
  def process_message(args):
173
+ i, message, logo_image, voice_ids, male_stability, male_style = args
174
+ voice_id = voice_ids[i % len(voice_ids)]
175
 
176
+ if i % len(voice_ids) == 0:
177
  text_color = "#cdfa8a"
178
  stability = 0.8
179
  style = 0
 
208
  print(f"Error processing message {i+1}: {e}")
209
  return (None, None, None)
210
 
211
+ def generate_conversation_video(messages, voice_ids, logo_url, male_stability, male_style):
212
  logo_image = download_and_convert_svg_to_png(logo_url)
213
  if logo_image is None:
214
  return None
 
217
  audio_clips = []
218
  temp_audio_paths = []
219
 
220
+ args = [(i, message, logo_image, voice_ids, male_stability, male_style) for i, message in enumerate(messages)]
221
  max_workers = 5
222
 
223
  with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
 
271
  return temp_video_path
272
 
273
  def generate_video(description, female_voice, male_voice, male_stability=0.65, male_style=0.35):
274
+ voice_ids = [
275
+ female_voice, # First speaker (female)
276
+ male_voice # Second speaker (male)
277
+ ]
278
  logo_url = "https://opencall.ai/images/logo-symbol.svg"
279
 
280
  messages = get_convo_list(description)
281
+ video_path = generate_conversation_video(messages, voice_ids, logo_url, male_stability, male_style)
282
 
283
  return video_path
284
 
285
# Get available voices
default_female_id = "cgSgspJ2msm6clMCkdW9"  # Default female voice ID
default_male_id = "3Niy6MUaDzcs7Liw7dFs"  # Default male voice ID

# get_voices() returns an empty list when the voices request comes back with a
# non-200 status. The dropdowns further down index into these lists to pick
# their initial value, so an empty list would raise IndexError at start-up.
# Fall back to the two known default voice IDs so the interface still loads.
voices = get_voices() or [
    ("Default female voice", default_female_id),
    ("Default male voice", default_male_id),
]

# Create voice selection dropdowns: (display name, voice_id) pairs.
# Each dropdown gets its own list object.
female_voice_names = [(name, voice_id) for name, voice_id in voices]
male_voice_names = [(name, voice_id) for name, voice_id in voices]

# Set default selections: position of the preferred default voice, or the
# first entry when that voice is not in the list.
default_female_idx = next(
    (i for i, (_, voice_id) in enumerate(female_voice_names) if voice_id == default_female_id),
    0,
)
default_male_idx = next(
    (i for i, (_, voice_id) in enumerate(male_voice_names) if voice_id == default_male_id),
    0,
)
297
+
298
+ # Create Gradio interface
299
  iface = gr.Interface(
300
  fn=generate_video,
301
  inputs=[
 
306
  info="You can be specific about the number of turns, tone, and content of the conversation"
307
  ),
308
  gr.Dropdown(
309
+ choices=female_voice_names,
310
+ value=female_voice_names[default_female_idx][1],
311
  label="Female Voice",
312
+ type="value",
313
  info="Select the voice for the phone agent"
314
  ),
315
  gr.Dropdown(
316
+ choices=male_voice_names,
317
+ value=male_voice_names[default_male_idx][1],
318
+ label="Male Voice",
319
+ type="value",
320
  info="Select the voice for the customer"
321
  ),
322
  gr.Slider(