File size: 4,451 Bytes
1aef621
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
716173c
1aef621
 
 
 
46bf33d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5053ef7
46bf33d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1aef621
 
 
 
 
 
 
716173c
1aef621
 
 
 
716173c
1aef621
 
 
716173c
5053ef7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
716173c
1aef621
 
 
 
 
ca1f683
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import gradio as gr
import asyncio
import edge_tts
import speech_recognition as sr
from pydub import AudioSegment
from pydub.playback import play
import os
from huggingface_hub import InferenceClient
import whisper
import torch
from io import BytesIO
import tempfile

# Get the Hugging Face token from environment variable
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    raise ValueError("HF_TOKEN environment variable is not set")

# Initialize the Hugging Face Inference Client
client = InferenceClient(
    "mistralai/Mistral-Nemo-Instruct-2407",
    token=hf_token
)

# Load the Whisper model
whisper_model = whisper.load_model("tiny.en", device='cpu')

# Initialize an empty chat history
chat_history = []

async def text_to_speech_stream(text):
    """Convert text to speech using edge_tts and return the audio file path."""
    communicate = edge_tts.Communicate(text, "en-US-AvaMultilingualNeural")
    audio_data = b""

    async for chunk in communicate.stream():
        if chunk["type"] == "audio":
            audio_data += chunk["data"]

    # Save the audio data to a temporary file
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_file:
        temp_file.write(audio_data)
        return temp_file.name

def whisper_speech_to_text(audio):
    """Convert speech to text using Whisper model."""
    try:
        result = whisper_model.transcribe(audio)
        text = result['text']
        return text
    except Exception as e:
        print(f"Whisper Error: {e}")
        return None
    finally:
        # Clear CUDA cache
        torch.cuda.empty_cache()

async def chat_with_ai(message, history):
    global chat_history
    
    # Add user message to chat history
    chat_history.append({"role": "user", "content": message})
    
    try:
        # Send chat completion request
        response = client.chat_completion(
            messages=[{"role": "system", "content": "You are a helpful voice assistant. Provide concise and clear responses to user queries."}] + chat_history,
            max_tokens=800,
            temperature=0.7
        )
        
        response_text = response.choices[0].message['content']
        
        # Add assistant's response to chat history
        chat_history.append({"role": "assistant", "content": response_text})
        
        # Generate speech for the response
        audio_path = await text_to_speech_stream(response_text)
        
        return response_text, audio_path
    except Exception as e:
        print(f"Error: {e}")
        return str(e), None

def transcribe_and_chat(audio):
    # Transcribe audio to text
    text = whisper_speech_to_text(audio)
    if text is None:
        return "Sorry, I couldn't understand the audio.", None
    
    # Chat with AI using the transcribed text
    response, audio_path = asyncio.run(chat_with_ai(text, []))
    return response, audio_path

# Define the Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# AI Voice Assistant")
    
    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(type="filepath", label="Speak here", interactive=False)
            text_input = gr.Textbox(label="Or type your message here")
        
        with gr.Column():
            chat_output = gr.Textbox(label="AI Response")
            audio_output = gr.Audio(label="AI Voice Response", interactive=False)
    
    audio_button = gr.Button("Send Audio")
    text_button = gr.Button("Send Text")

    # Add custom JavaScript to handle spacebar press and play audio automatically
    demo.append(gr.HTML("""
        <script>
            document.addEventListener('keydown', function(event) {
                if (event.code === 'Space') {
                    document.querySelector('input[type="file"]').click();
                }
            });
            
            document.addEventListener('gradioAudioLoaded', function(event) {
                var audioElement = document.querySelector('audio');
                if (audioElement) {
                    audioElement.play();
                }
            });
        </script>
    """))

    audio_button.click(transcribe_and_chat, inputs=audio_input, outputs=[chat_output, audio_output])
    text_button.click(lambda x: asyncio.run(chat_with_ai(x, [])), inputs=text_input, outputs=[chat_output, audio_output])

# Launch the Gradio app
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)