import json
import os

import gradio as gr
import IPython.display
import openai
import requests

import config  # local config.py; assumed here to hold the OpenAI API key

# Configure the OpenAI client (assumes config.py defines OPENAI_API_KEY)
openai.api_key = config.OPENAI_API_KEY

# Seed the conversation with a system prompt that sets the persona
messages = [{"role": "system",
             "content": "You are Steve Jobs. Respond to all input in 25 words or less."}]

# Set up the ElevenLabs streaming endpoint URL and headers
url = f"https://api.elevenlabs.io/v1/text-to-speech/{os.environ.get('voice_id')}/stream"
headers = {
    "accept": "*/*",
    "xi-api-key": os.environ.get('elevenlabs_api_key'),
    "Content-Type": "application/json",
}


# Handle the Gradio input and generate the response
def transcribe(audio):
    global messages

    # API call 1: transcribe the user's audio input with Whisper
    with open(audio, "rb") as audio_file:
        transcript = openai.Audio.transcribe("whisper-1", audio_file)

    # Append the user's message to the message history
    messages.append({"role": "user", "content": transcript["text"]})

    # API call 2: generate a reply with OpenAI's chat API
    response = openai.ChatCompletion.create(model="gpt-3.5-turbo", messages=messages)

    # Extract the assistant's message from the API response
    # and append it to the message history
    system_message = response["choices"][0]["message"]
    messages.append(system_message)

    # API call 3: synthesize an audio response from the assistant's message
    data = {
        "text": system_message["content"],
        "voice_settings": {
            "stability": 0,
            "similarity_boost": 0,
        },
    }
    response = requests.post(url, headers=headers, data=json.dumps(data), stream=True)

    # Save the streamed audio response to a file
    if response.ok:
        with open("output.wav", "wb") as f:
            for chunk in response.iter_content(chunk_size=1024):
                f.write(chunk)
    else:
        print(f"Error: {response.status_code} - {response.reason}")

    # Preview the audio inline when running in a notebook (no-op as a script)
    IPython.display.display(IPython.display.Audio('output.wav'))

    # Build a chat transcript for display in the Gradio UI,
    # skipping the system prompt
    chat_transcript = ""
    for message in messages:
        if message["role"] != "system":
            chat_transcript += message["role"] + ": " + message["content"] + "\n\n"

    return chat_transcript, "output.wav"


# Define the Gradio UI: microphone in, transcript text and audio out
ui = gr.Interface(fn=transcribe,
                  inputs=gr.Audio(source="microphone", type="filepath"),
                  outputs=["text", "audio"])
ui.launch(share=True)
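
# How to run -- a sketch, assuming the dependency and key names used above
# (the filename `app.py` is hypothetical; use whatever this file is saved as):
#
#   pip install gradio openai requests ipython
#   export voice_id=...            # an ElevenLabs voice ID
#   export elevenlabs_api_key=...  # your ElevenLabs API key
#   python app.py
#
# With share=True, launch() prints a temporary public Gradio URL in addition
# to the local one, so the assistant can be tried from another device.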