# Source: HuggingFace Spaces file viewer (akshayvkt — "Update app.py", commit e2c2451)
# Viewer chrome: raw | history | blame — 3.2 kB
import gradio as gr
import openai
import requests
import json
import os
# OpenAI credentials come from the environment; the key must be present
# before any openai.* call in transcribe() is made.
openai.api_key = os.environ.get('OPENAI_API_KEY')
# NOTE(review): gr.HTML is called at module level, outside any gr.Blocks
# context — presumably intended as a page header; confirm it actually
# renders, since standalone components are normally created inside a layout.
gr.HTML("""
<div style="text-align: center; max-width: 700px; margin: 0 auto;">
<div
style="
display: inline-flex;
align-items: center;
gap: 0.8rem;
font-size: 1.75rem;
"
>
<h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
Talk to AI Steve Jobs: Audio-to-Text+Audio generation
</h1>
</div>
</div>
<p>
Have a back-and-forth conversation with AI Steve Jobs, powered by ChatGPT + Whisper + ElevenLabs + HuggingFace
<br/>
<p/>
""")
# Conversation history seeded with the persona prompt.
# NOTE(review): module-level global — every visitor to this Space shares
# one history; confirm that is acceptable for this demo.
messages = [{"role": "system", "content": 'You are Steve Jobs. Respond to all input in 25 words or less.'}]
# Set up the API endpoint URL and headers
# ElevenLabs streaming text-to-speech endpoint; the voice id and API key
# are both read from the environment.
url = f"https://api.elevenlabs.io/v1/text-to-speech/{os.environ.get('voice_id')}/stream"
headers = {
"accept": "*/*",
"xi-api-key": os.environ.get('elevenlabs_api_key'),
"Content-Type": "application/json",
}
# Define a function to handle the Gradio input and generate the response
def transcribe(audio):
    """Run one conversation turn: transcribe audio, reply, synthesize speech.

    Parameters
    ----------
    audio : str
        Filesystem path to the recorded clip (Gradio ``type="filepath"``).

    Returns
    -------
    tuple[str, str]
        The running chat transcript (system prompt omitted) and the path
        of the synthesized reply audio file (``'output.wav'``).
    """
    global messages

    # API call 1: speech-to-text via Whisper.
    # Use a context manager so the handle is always closed — the original
    # opened the file and never closed it (resource leak).
    with open(audio, "rb") as audio_file:
        transcript = openai.Audio.transcribe("whisper-1", audio_file)

    # Append the user's message to the shared conversation history.
    messages.append({"role": "user", "content": transcript["text"]})

    # API call 2: chat completion conditioned on the whole history.
    response = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=messages)
    # Extract the assistant's message and keep it in the history so the
    # next turn has full context.
    system_message = response["choices"][0]["message"]
    messages.append(system_message)

    # API call 3: ElevenLabs streaming text-to-speech on the reply text.
    data = {
        "text": system_message["content"],
        "voice_settings": {
            "stability": 0,
            "similarity_boost": 0
        }
    }
    # Separate name: the original reused `response`, shadowing the chat reply.
    tts_response = requests.post(url, headers=headers, data=json.dumps(data), stream=True)

    # Save the audio response to a file, streaming in 1 KiB chunks.
    if tts_response.ok:
        with open("output.wav", "wb") as f:
            for chunk in tts_response.iter_content(chunk_size=1024):
                f.write(chunk)
    else:
        # Best-effort: report the failure and fall through; Gradio will
        # surface a stale or missing output.wav if synthesis failed.
        print(f"Error: {tts_response.status_code} - {tts_response.reason}")

    # Build the transcript shown in the UI (skip the system persona prompt).
    chat_transcript = ""
    for message in messages:
        if message['role'] != 'system':
            chat_transcript += message['role'] + ": " + message['content'] + "\n\n"

    return chat_transcript, 'output.wav'
# Define the Gradio UI interface
# ui = gr.Interface(fn=transcribe, inputs=gr.Audio(source="microphone", type="filepath"), outputs="text")
ui = gr.Interface(fn=transcribe, inputs=gr.Audio(source="microphone", type="filepath"), outputs=['text','audio'])
ui.launch(debug=True)