Cyleux committed on
Commit
c12bbd2
·
verified ·
1 Parent(s): 9acc301

Update spaces.py

Browse files
Files changed (1) hide show
  1. spaces.py +22 -24
spaces.py CHANGED
@@ -30,10 +30,12 @@ def get_voices():
30
  }
31
  response = requests.get(url, headers=headers)
32
  if response.status_code == 200:
33
- return response.json()["voices"]
 
 
34
  else:
35
- print(f"Error getting voices: {response.status_code} - {response.text}")
36
- return []
37
 
38
  def get_convo_list(description):
39
  prompt =f"Your task is to return a JSON object representing a complete conversation containing a key 'turns' with a value which is just a list of objects containing 'turn_number', an integer, and 'message', the message for that turn. Ensure you return as many turns as the user specifies, if they specify. Remember, each turn is a turn in a conversation between a phone agent (female) and a human (male). The phone agent should speak first. The conversation is described as:\n{description}.\nCritically, ensure that the human turns employ filler words (uh, uhhhh, ummmm, yeahhh, hm, hmm, etc with repeated letters to denote thinking...) and realistic language without using *sounds effects*. I repeat, do NOT use *sound effects*. Additionally, do not over-use filler words or start every human response with them. The goal is to sound realistic, not exagerrated. The AI should be conversational, employing transition phrases. The AI should always end their response with a question except when saying goodbye. Additionally, digits spaced out. For instance, the human might say: 'My phone number is 8 3 1... 5 4 8... 9 2 2 3...' instead of writing it out. They might also say 'My email is steve at gmail dot com.' where it is written out. Now provide the JSON."
@@ -171,10 +173,10 @@ def create_video_clip(image, duration, target_resolution=(1920, 1080)):
171
  return clip.set_duration(duration)
172
 
173
  def process_message(args):
174
- i, message, logo_image, voice_ids, male_stability, male_style = args
175
- voice_id = voice_ids[i % len(voice_ids)]
176
 
177
- if i % len(voice_ids) == 0:
178
  text_color = "#cdfa8a"
179
  stability = 0.8
180
  style = 0
@@ -209,7 +211,7 @@ def process_message(args):
209
  print(f"Error processing message {i+1}: {e}")
210
  return (None, None, None)
211
 
212
- def generate_conversation_video(messages, voice_ids, logo_url, male_stability, male_style):
213
  logo_image = download_and_convert_svg_to_png(logo_url)
214
  if logo_image is None:
215
  return None
@@ -218,7 +220,7 @@ def generate_conversation_video(messages, voice_ids, logo_url, male_stability, m
218
  audio_clips = []
219
  temp_audio_paths = []
220
 
221
- args = [(i, message, logo_image, voice_ids, male_stability, male_style) for i, message in enumerate(messages)]
222
  max_workers = 5
223
 
224
  with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
@@ -271,23 +273,21 @@ def generate_conversation_video(messages, voice_ids, logo_url, male_stability, m
271
 
272
  return temp_video_path
273
 
274
- def generate_video(description, female_voice_id, male_voice_id, male_stability=0.65, male_style=0.35):
275
- voice_ids = [
276
- female_voice_id, # First speaker (female)
277
- male_voice_id # Second speaker (male)
278
- ]
279
  logo_url = "https://opencall.ai/images/logo-symbol.svg"
280
 
281
  messages = get_convo_list(description)
282
- video_path = generate_conversation_video(messages, voice_ids, logo_url, male_stability, male_style)
283
 
284
  return video_path
285
 
286
- # Get available voices
287
- available_voices = get_voices()
288
- voice_choices = [(voice["voice_id"], voice["name"]) for voice in available_voices]
289
-
290
  # Create Gradio interface
 
 
291
  iface = gr.Interface(
292
  fn=generate_video,
293
  inputs=[
@@ -298,16 +298,14 @@ iface = gr.Interface(
298
  info="You can be specific about the number of turns, tone, and content of the conversation"
299
  ),
300
  gr.Dropdown(
301
- choices=[choice[1] for choice in voice_choices],
302
- value=next(voice["name"] for voice in available_voices if voice["voice_id"] == "cgSgspJ2msm6clMCkdW9"),
303
  label="Female Voice",
304
- info="Select the voice for the female speaker"
305
  ),
306
  gr.Dropdown(
307
- choices=[choice[1] for choice in voice_choices],
308
- value=next(voice["name"] for voice in available_voices if voice["voice_id"] == "3Niy6MUaDzcs7Liw7dFs"),
309
  label="Male Voice",
310
- info="Select the voice for the male speaker"
311
  ),
312
  gr.Slider(
313
  minimum=0.1,
 
30
  }
31
  response = requests.get(url, headers=headers)
32
  if response.status_code == 200:
33
+ voices = response.json()["voices"]
34
+ voice_options = {voice["name"]: voice["voice_id"] for voice in voices}
35
+ return voice_options
36
  else:
37
+ print(f"Error getting voices: {response.status_code}")
38
+ return {}
39
 
40
  def get_convo_list(description):
41
  prompt =f"Your task is to return a JSON object representing a complete conversation containing a key 'turns' with a value which is just a list of objects containing 'turn_number', an integer, and 'message', the message for that turn. Ensure you return as many turns as the user specifies, if they specify. Remember, each turn is a turn in a conversation between a phone agent (female) and a human (male). The phone agent should speak first. The conversation is described as:\n{description}.\nCritically, ensure that the human turns employ filler words (uh, uhhhh, ummmm, yeahhh, hm, hmm, etc with repeated letters to denote thinking...) and realistic language without using *sounds effects*. I repeat, do NOT use *sound effects*. Additionally, do not over-use filler words or start every human response with them. The goal is to sound realistic, not exagerrated. The AI should be conversational, employing transition phrases. The AI should always end their response with a question except when saying goodbye. Additionally, digits spaced out. For instance, the human might say: 'My phone number is 8 3 1... 5 4 8... 9 2 2 3...' instead of writing it out. They might also say 'My email is steve at gmail dot com.' where it is written out. Now provide the JSON."
 
173
  return clip.set_duration(duration)
174
 
175
  def process_message(args):
176
+ i, message, logo_image, female_voice_id, male_voice_id, male_stability, male_style = args
177
+ voice_id = female_voice_id if i % 2 == 0 else male_voice_id
178
 
179
+ if i % 2 == 0:
180
  text_color = "#cdfa8a"
181
  stability = 0.8
182
  style = 0
 
211
  print(f"Error processing message {i+1}: {e}")
212
  return (None, None, None)
213
 
214
+ def generate_conversation_video(messages, female_voice_id, male_voice_id, logo_url, male_stability, male_style):
215
  logo_image = download_and_convert_svg_to_png(logo_url)
216
  if logo_image is None:
217
  return None
 
220
  audio_clips = []
221
  temp_audio_paths = []
222
 
223
+ args = [(i, message, logo_image, female_voice_id, male_voice_id, male_stability, male_style) for i, message in enumerate(messages)]
224
  max_workers = 5
225
 
226
  with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
 
273
 
274
  return temp_video_path
275
 
276
def generate_video(description, female_voice, male_voice, male_stability=0.65, male_style=0.35):
    """Generate a conversation video from a free-text description.

    The voice arguments are display names (the keys of the mapping returned
    by ``get_voices()``); they are resolved to voice IDs here before the
    conversation is generated and rendered.

    Args:
        description: Text describing the conversation to generate; passed to
            ``get_convo_list()``.
        female_voice: Display name of the voice selected in the "Female Voice"
            dropdown (the phone agent).
        male_voice: Display name of the voice selected in the "Male Voice"
            dropdown (the customer).
        male_stability: Forwarded unchanged to
            ``generate_conversation_video()``.
        male_style: Forwarded unchanged to ``generate_conversation_video()``.

    Returns:
        The value returned by ``generate_conversation_video()`` (a video file
        path, or ``None`` when that function gives up early).

    Raises:
        gr.Error: If the voice list could not be loaded, or if either voice
            selection is missing or not present in the voice list.
    """
    voice_options = get_voices()
    # get_voices() returns an empty dict when the voices request fails.
    if not voice_options:
        raise gr.Error("Could not load the list of voices. Please try again later.")

    # The dropdowns have no default value, so a selection may arrive as None;
    # validate both up front instead of failing with a bare KeyError.
    invalid = [
        label
        for label, selected in (("female", female_voice), ("male", male_voice))
        if selected not in voice_options
    ]
    if invalid:
        raise gr.Error(f"Please select a valid {' and '.join(invalid)} voice.")

    female_voice_id = voice_options[female_voice]
    male_voice_id = voice_options[male_voice]

    logo_url = "https://opencall.ai/images/logo-symbol.svg"

    messages = get_convo_list(description)
    video_path = generate_conversation_video(
        messages, female_voice_id, male_voice_id, logo_url, male_stability, male_style
    )

    return video_path
287
 
 
 
 
 
288
  # Create Gradio interface
289
+ voice_options = get_voices()
290
+
291
  iface = gr.Interface(
292
  fn=generate_video,
293
  inputs=[
 
298
  info="You can be specific about the number of turns, tone, and content of the conversation"
299
  ),
300
  gr.Dropdown(
301
+ choices=list(voice_options.keys()),
 
302
  label="Female Voice",
303
+ info="Select the voice for the phone agent"
304
  ),
305
  gr.Dropdown(
306
+ choices=list(voice_options.keys()),
 
307
  label="Male Voice",
308
+ info="Select the voice for the customer"
309
  ),
310
  gr.Slider(
311
  minimum=0.1,