File size: 2,079 Bytes
896d8a0
f152a90
896d8a0
 
 
 
 
76084c0
 
896d8a0
 
 
 
76084c0
b207a62
896d8a0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bfc5bba
896d8a0
 
 
 
 
 
 
 
 
 
 
b207a62
896d8a0
 
 
 
 
 
 
 
f152a90
 
896d8a0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import functools
import os
import tempfile

import gradio as gr
import librosa
import numpy as np
import torch
from gtts import gTTS
from openai import OpenAI
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# OpenRouter API setup.
# The API key is read from the OPENROUTER_API_KEY environment variable so that
# no credential is stored in source control. If the variable is missing, the
# lookup raises KeyError naming it, so a misconfigured deployment fails at
# startup rather than on the first request.
# NOTE(review): a key was previously hard-coded here; treat it as leaked and
# revoke it in the OpenRouter dashboard.
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=os.environ["OPENROUTER_API_KEY"],
)

_WHISPER_MODEL_ID = "openai/whisper-large-v3-turbo"
_WHISPER_SAMPLE_RATE = 16000
_CHAT_MODEL_ID = "qwen/qwen2.5-vl-32b-instruct:free"


@functools.lru_cache(maxsize=1)
def _load_whisper():
    """Load the Whisper processor and model once and reuse them on later calls."""
    processor = WhisperProcessor.from_pretrained(_WHISPER_MODEL_ID)
    model = WhisperForConditionalGeneration.from_pretrained(_WHISPER_MODEL_ID)
    return processor, model


def _transcribe(audio_filepath):
    """Return the Whisper transcription of the audio file, stripped of whitespace."""
    processor, model = _load_whisper()

    # librosa resamples to the requested rate while loading.
    audio_data, sample_rate = librosa.load(audio_filepath, sr=_WHISPER_SAMPLE_RATE)

    input_features = processor(
        audio_data, sampling_rate=sample_rate, return_tensors="pt"
    ).input_features
    predicted_ids = model.generate(input_features)
    return processor.batch_decode(predicted_ids, skip_special_tokens=True)[0].strip()


def _synthesize_speech(text):
    """Write `text` as English speech to a new temporary .mp3 and return its path."""
    tts = gTTS(text, lang="en")
    # Reserve a unique path, then close the handle before gTTS writes to it by
    # name: this avoids leaking the descriptor and avoids writing to a file
    # that is still open. delete=False keeps the file for the caller.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_audio:
        audio_path = temp_audio.name
    tts.save(audio_path)
    return audio_path


def voice_assistant(audio_filepath):
    """Answer a spoken question with text and synthesized speech.

    The recording is transcribed with Whisper, the transcription is sent as a
    single user message to the chat model through the module-level OpenRouter
    ``client``, and the reply is converted to speech with gTTS.

    Args:
        audio_filepath: Path to the recorded audio file, or ``None`` when
            nothing has been recorded yet.

    Returns:
        A ``(text, audio_path)`` tuple. ``text`` is the AI reply, or a short
        message for the user when there is nothing to answer. ``audio_path``
        is the path to a temporary ``.mp3`` containing the spoken reply, or
        ``None`` when no speech was generated.
    """
    if audio_filepath is None:
        return "Please record your question.", None

    user_voice = _transcribe(audio_filepath)
    if not user_voice:
        # Nothing intelligible was recorded; skip the chat request entirely.
        return "Sorry, I could not understand the audio. Please try again.", None

    # Generate the AI response using OpenRouter.
    completion = client.chat.completions.create(
        model=_CHAT_MODEL_ID,
        messages=[{"role": "user", "content": user_voice}],
    )

    # The message content may be None; gTTS rejects empty text, so stop here.
    ai_response = (completion.choices[0].message.content or "").strip()
    if not ai_response:
        return "The AI returned an empty response. Please try again.", None

    return ai_response, _synthesize_speech(ai_response)

# Gradio interface: a microphone recording goes in; the AI reply comes back as
# text plus a synthesized audio file.
iface = gr.Interface(
    fn=voice_assistant,
    inputs=gr.Audio(sources=["microphone"], type="filepath", label="Speak Your Question"),
    outputs=[gr.Textbox(label="AI Response"), gr.Audio(label="Voice Response")],
    title="AI Voice Assistant",
    # The only input component is the microphone, so the description must not
    # promise typed questions.
    description="Speak a question, and the AI will respond with voice output.",
    live=True,
)

iface.launch()