"""Gradio voice chatbot that replies in the persona of Tommy Vercetti.

Pipeline: microphone recording -> Google Web Speech transcription ->
placeholder text response -> (not yet implemented) text-to-speech audio.
"""

import os
from typing import Optional, Tuple

import gradio as gr
import speech_recognition as sr
from pydub import AudioSegment  # noqa: F401  # not used below; retained for the pending TTS step


def transcribe_audio(audio_input: Optional[str]) -> str:
    """Transcribe a recorded audio file to text.

    Args:
        audio_input: Path to the recorded audio file, or ``None`` when the
            user submitted the form without recording anything.

    Returns:
        The recognized text, or a human-readable apology string when no audio
        was supplied, the speech was unintelligible, or the recognition
        request failed.
    """
    # Gradio passes None when nothing was recorded; sr.AudioFile cannot
    # handle that, so answer gracefully instead of crashing.
    if audio_input is None:
        return "Sorry, I didn't receive any audio."

    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_input) as source:
        audio_data = recognizer.record(source)

    try:
        return recognizer.recognize_google(audio_data)
    except sr.UnknownValueError:
        return "Sorry, I couldn't understand the audio."
    except sr.RequestError:
        return "Sorry, there was a problem with the request."


def generate_response(user_input: str) -> Tuple[str, str]:
    """Build the chatbot reply for *user_input*.

    This is still a placeholder: the text is a fixed template and no audio is
    synthesized yet.

    Args:
        user_input: The transcribed user utterance.

    Returns:
        A ``(text_response, output_path)`` pair, where ``output_path`` is the
        location the TTS step is expected to write the spoken reply to.
    """
    text_response = f"Responding as Tommy Vercetti: {user_input}"
    output_path = "response.wav"  # Placeholder path
    # TODO: implement TTS for text_response and save it to output_path.
    return text_response, output_path


def respond(audio_input: Optional[str]) -> Tuple[str, Optional[str]]:
    """Turn a recorded utterance into a text reply and a spoken reply.

    Args:
        audio_input: Path to the recorded audio file supplied by Gradio, or
            ``None`` when nothing was recorded.

    Returns:
        A ``(text_response, audio_path)`` pair for the ``text`` and ``audio``
        outputs. ``audio_path`` is ``None`` when no response audio file exists
        on disk.
    """
    user_input = transcribe_audio(audio_input)
    text_response, output_path = generate_response(user_input)

    # TTS is not implemented yet, so the file may not exist. Returning None
    # leaves the audio output empty instead of failing the whole request.
    if not os.path.isfile(output_path):
        return text_response, None
    return text_response, output_path


input_audio = gr.Audio(
    sources=["microphone"],
    # sr.AudioFile needs a file path; Gradio's default type ("numpy") would
    # pass a (sample_rate, ndarray) tuple instead.
    type="filepath",
    waveform_options=gr.WaveformOptions(
        waveform_color="#01C6FF",
        waveform_progress_color="#0066B4",
        skip_length=2,
        show_controls=False,
    ),
)

demo = gr.Interface(
    fn=respond,
    inputs=input_audio,
    outputs=["text", "audio"],
    title="Tommy Vercetti Chatbot",
    description="Chat with Tommy Vercetti from GTA Vice City. Get responses in both text and voice!",
)

if __name__ == "__main__":
    demo.launch(debug=True)