Cyleux committed on
Commit
f1c85e2
·
verified ·
1 Parent(s): 7d0da76

Update spaces.py

Browse files
Files changed (1) hide show
  1. spaces.py +31 -3
spaces.py CHANGED
@@ -23,6 +23,18 @@ client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
23
  # ElevenLabs API key
24
  elevenlabs_api_key = os.getenv("ELEVENLABS_API_KEY")
25
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  def get_convo_list(description):
27
  prompt =f"Your task is to return a JSON object representing a complete conversation containing a key 'turns' with a value which is just a list of objects containing 'turn_number', an integer, and 'message', the message for that turn. Ensure you return as many turns as the user specifies, if they specify. Remember, each turn is a turn in a conversation between a phone agent (female) and a human (male). The phone agent should speak first. The conversation is described as:\n{description}.\nCritically, ensure that the human turns employ filler words (uh, uhhhh, ummmm, yeahhh, hm, hmm, etc with repeated letters to denote thinking...) and realistic language without using *sounds effects*. I repeat, do NOT use *sound effects*. Additionally, do not over-use filler words or start every human response with them. The goal is to sound realistic, not exagerrated. The AI should be conversational, employing transition phrases. The AI should always end their response with a question except when saying goodbye. Additionally, digits spaced out. For instance, the human might say: 'My phone number is 8 3 1... 5 4 8... 9 2 2 3...' instead of writing it out. They might also say 'My email is steve at gmail dot com.' where it is written out. Now provide the JSON."
28
  new_output = ""
@@ -259,10 +271,10 @@ def generate_conversation_video(messages, voice_ids, logo_url, male_stability, m
259
 
260
  return temp_video_path
261
 
262
- def generate_video(description, male_stability=0.65, male_style=0.35):
263
  voice_ids = [
264
- "cgSgspJ2msm6clMCkdW9", # First speaker
265
- "3Niy6MUaDzcs7Liw7dFs" # Second speaker
266
  ]
267
  logo_url = "https://opencall.ai/images/logo-symbol.svg"
268
 
@@ -271,6 +283,10 @@ def generate_video(description, male_stability=0.65, male_style=0.35):
271
 
272
  return video_path
273
 
 
 
 
 
274
  # Create Gradio interface
275
  iface = gr.Interface(
276
  fn=generate_video,
@@ -281,6 +297,18 @@ iface = gr.Interface(
281
  placeholder="Describe the conversation you want to generate...",
282
  info="You can be specific about the number of turns, tone, and content of the conversation"
283
  ),
 
 
 
 
 
 
 
 
 
 
 
 
284
  gr.Slider(
285
  minimum=0.1,
286
  maximum=1.0,
 
23
  # ElevenLabs API key
24
  elevenlabs_api_key = os.getenv("ELEVENLABS_API_KEY")
25
 
26
+ def get_voices():
27
+ url = "https://api.elevenlabs.io/v1/voices"
28
+ headers = {
29
+ "xi-api-key": elevenlabs_api_key
30
+ }
31
+ response = requests.get(url, headers=headers)
32
+ if response.status_code == 200:
33
+ return response.json()["voices"]
34
+ else:
35
+ print(f"Error getting voices: {response.status_code} - {response.text}")
36
+ return []
37
+
38
  def get_convo_list(description):
39
  prompt =f"Your task is to return a JSON object representing a complete conversation containing a key 'turns' with a value which is just a list of objects containing 'turn_number', an integer, and 'message', the message for that turn. Ensure you return as many turns as the user specifies, if they specify. Remember, each turn is a turn in a conversation between a phone agent (female) and a human (male). The phone agent should speak first. The conversation is described as:\n{description}.\nCritically, ensure that the human turns employ filler words (uh, uhhhh, ummmm, yeahhh, hm, hmm, etc with repeated letters to denote thinking...) and realistic language without using *sounds effects*. I repeat, do NOT use *sound effects*. Additionally, do not over-use filler words or start every human response with them. The goal is to sound realistic, not exagerrated. The AI should be conversational, employing transition phrases. The AI should always end their response with a question except when saying goodbye. Additionally, digits spaced out. For instance, the human might say: 'My phone number is 8 3 1... 5 4 8... 9 2 2 3...' instead of writing it out. They might also say 'My email is steve at gmail dot com.' where it is written out. Now provide the JSON."
40
  new_output = ""
 
271
 
272
  return temp_video_path
273
 
274
+ def generate_video(description, female_voice_id, male_voice_id, male_stability=0.65, male_style=0.35):
275
  voice_ids = [
276
+ female_voice_id, # First speaker (female)
277
+ male_voice_id # Second speaker (male)
278
  ]
279
  logo_url = "https://opencall.ai/images/logo-symbol.svg"
280
 
 
283
 
284
  return video_path
285
 
286
+ # Get available voices
287
+ available_voices = get_voices()
288
+ voice_choices = [(voice["voice_id"], voice["name"]) for voice in available_voices]
289
+
290
  # Create Gradio interface
291
  iface = gr.Interface(
292
  fn=generate_video,
 
297
  placeholder="Describe the conversation you want to generate...",
298
  info="You can be specific about the number of turns, tone, and content of the conversation"
299
  ),
300
+ gr.Dropdown(
301
+ choices=[choice[0] for choice in voice_choices],
302
+ value="cgSgspJ2msm6clMCkdW9",
303
+ label="Female Voice",
304
+ info="Select the voice for the female speaker"
305
+ ),
306
+ gr.Dropdown(
307
+ choices=[choice[0] for choice in voice_choices],
308
+ value="3Niy6MUaDzcs7Liw7dFs",
309
+ label="Male Voice",
310
+ info="Select the voice for the male speaker"
311
+ ),
312
  gr.Slider(
313
  minimum=0.1,
314
  maximum=1.0,