import tempfile

import gradio as gr
import librosa
import torch
from gtts import gTTS
from openai import OpenAI
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# OpenRouter API setup
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key="YOUR_OPENROUTER_API_KEY",  # Replace with your OpenRouter API key; never commit a real key
)

# Load the Whisper speech-to-text model once at startup rather than on every request
device = "cuda" if torch.cuda.is_available() else "cpu"
processor = WhisperProcessor.from_pretrained("openai/whisper-large-v3-turbo")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v3-turbo").to(device)


def voice_assistant(audio_filepath):
    if audio_filepath is None:
        return "Please record your question.", None

    # Load the recording and resample it to 16 kHz, the rate Whisper expects
    audio_data, sample_rate = librosa.load(audio_filepath, sr=16000)

    # Transcribe the audio to text with Whisper
    input_features = processor(
        audio_data, sampling_rate=sample_rate, return_tensors="pt"
    ).input_features.to(device)
    predicted_ids = model.generate(input_features)
    user_voice = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

    # Generate the AI response via OpenRouter
    completion = client.chat.completions.create(
        model="qwen/qwen2.5-vl-32b-instruct:free",
        messages=[{"role": "user", "content": user_voice}],
    )
    ai_response = completion.choices[0].message.content

    # Convert the AI response to speech with gTTS
    tts = gTTS(ai_response, lang="en")
    temp_audio = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
    temp_audio.close()  # Close the handle so gTTS can write to the path on all platforms
    tts.save(temp_audio.name)

    # Return both the text and the spoken reply
    return ai_response, temp_audio.name


# Gradio interface
iface = gr.Interface(
    fn=voice_assistant,
    inputs=gr.Audio(sources=["microphone"], type="filepath", label="Speak Your Question"),
    outputs=[gr.Textbox(label="AI Response"), gr.Audio(label="Voice Response")],
    title="AI Voice Assistant",
    description="Speak a question, and the AI will respond with text and voice output.",
    live=True,
)

iface.launch()
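
# Quick check without the browser UI (a sketch, not part of the original script):
# "question.wav" below is a hypothetical local recording; substitute any short clip.
# Comment out iface.launch() above, then uncomment these lines to call the handler directly.
# reply_text, reply_audio_path = voice_assistant("question.wav")
# print("AI response:", reply_text)
# print("Spoken reply saved to:", reply_audio_path)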