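"""AI Voice Assistant: a Gradio app that records the user's voice, transcribes
it with the Hugging Face Whisper inference API, generates a reply with
Mistral-Nemo-Instruct-2407 via the chat-completion Inference API, and speaks
the reply back using edge-tts. Requires the HF_TOKEN environment variable."""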
import gradio as gr
import asyncio
import edge_tts
import os
from huggingface_hub import InferenceClient
import requests
import tempfile
import logging
import io
from pydub import AudioSegment

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Get the Hugging Face token from environment variable
hf_token = os.getenv("HF_TOKEN")
if not hf_token:
    raise ValueError("HF_TOKEN environment variable is not set")

# Initialize the Hugging Face Inference Client for chat completion
chat_client = InferenceClient("mistralai/Mistral-Nemo-Instruct-2407", token=hf_token)

# Whisper API settings
WHISPER_API_URL = "https://api-inference.huggingface.co/models/openai/whisper-large-v3-turbo"
headers = {"Authorization": f"Bearer {hf_token}"}
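# Note: the hosted inference API may return HTTP 503 while the model is
# cold-starting; whisper_speech_to_text below surfaces that as a request error.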

# Initialize an empty chat history
chat_history = []
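# NOTE: this module-level history is shared by every user/session of the app;
# per-session memory would need gr.State or a session-keyed store instead.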

async def text_to_speech_stream(text, voice_volume=1.0):
    """Synthesize `text` with edge-tts, apply a volume adjustment, and return
    the path to a temporary MP3 file."""
    communicate = edge_tts.Communicate(text, "en-US-BrianMultilingualNeural")
    audio_data = b""

    async for chunk in communicate.stream():
        if chunk["type"] == "audio":
            audio_data += chunk["data"]

    # Map the 0.0-2.0 slider value to a gain in decibels:
    # 0.0 -> -20 dB (much quieter), 1.0 -> 0 dB (unchanged), 2.0 -> +20 dB (much louder).
    audio = AudioSegment.from_mp3(io.BytesIO(audio_data))
    adjusted_audio = audio + (20 * voice_volume - 20)

    # delete=False keeps the file on disk so Gradio can serve it after this
    # function returns; the MP3s accumulate in the temp dir until the OS cleans up.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_file:
        adjusted_audio.export(temp_file.name, format="mp3")
        return temp_file.name

def whisper_speech_to_text(audio_path):
    """Convert speech to text using Hugging Face Whisper API."""
    if audio_path is None:
        logging.error("Error: No audio file provided")
        return ""
    
    if not os.path.exists(audio_path):
        logging.error(f"Error: Audio file not found at {audio_path}")
        return ""
    
    try:
        with open(audio_path, "rb") as audio_file:
            data = audio_file.read()
        response = requests.post(WHISPER_API_URL, headers=headers, data=data)
        response.raise_for_status()  # Raise an exception for bad status codes
        result = response.json()
        transcribed_text = result.get("text", "")
        logging.info(f"Transcribed text: {transcribed_text}")
        return transcribed_text
    except requests.exceptions.RequestException as e:
        logging.error(f"Error during API request: {e}")
        return ""
    except Exception as e:
        logging.error(f"Unexpected error in whisper_speech_to_text: {e}")
        return ""

async def chat_with_ai(message, voice_volume=1.0):
    """Send the user message to the chat model and synthesize the reply.

    Returns (response_text, audio_path); on failure, (error_message, None).
    """
    global chat_history

    chat_history.append({"role": "user", "content": message})
    
    try:
        response = chat_client.chat_completion(
            messages=[{"role": "system", "content": "You are a helpful voice assistant. Provide concise and clear responses to user queries."}] + chat_history,
            max_tokens=800,
            temperature=0.7
        )
        
        response_text = response.choices[0].message.content
        chat_history.append({"role": "assistant", "content": response_text})
        
        audio_path = await text_to_speech_stream(response_text, voice_volume)
        
        return response_text, audio_path
    except Exception as e:
        logging.error(f"Error in chat_with_ai: {e}")
        return str(e), None

def transcribe_and_chat(audio, voice_volume=1.0):
    """Transcribe the recording, get a chat reply, and return (text, audio_path)."""
    if audio is None:
        return "Sorry, no audio was provided. Please try recording again.", None

    text = whisper_speech_to_text(audio)
    if not text:
        return "Sorry, I couldn't understand the audio or there was an error in transcription. Please try again.", None

    response, audio_path = asyncio.run(chat_with_ai(text, voice_volume))
    return response, audio_path

def create_demo():
    with gr.Blocks(css="""
        @import url('https://fonts.googleapis.com/css2?family=Poppins:wght@400;500;700&display=swap');
        body { font-family: 'Poppins', sans-serif; margin: 0; padding: 0; box-sizing: border-box;}
        #audio-input {border: 2px solid #ffb703; padding: 10px;}
        #chat-output {background-color: #023047; color: #ffffff; font-size: 1.2em;}
        #audio-output {border: 2px solid #8ecae6;}
        #clear-button {background-color: #fb8500; color: white;}
        #voice-volume {background-color: #219ebc;}
        button {font-size: 16px;}
        audio {background-color: #ffb703; border-radius: 10px;}
        footer {display: none;}
        @media (max-width: 768px) {
            #audio-input, #chat-output, #audio-output { width: 100%; }
            button { width: 100%; }
        }
    """) as demo:
        gr.Markdown(
            """
            <div style='text-align:center; color:#023047; font-size: 28px; font-weight: bold;'>πŸ—£οΈ AI Voice Assistant</div>
            <p style='text-align:center; color:#8ecae6; font-size: 18px;'>Talk to your personal AI! Record your voice, and get a response in both text and audio.</p>
            <p style='text-align:center; color:#8ecae6;'>Powered by advanced AI models for real-time interaction.</p>
            """, 
            elem_id="header"
        )

        with gr.Row():
            with gr.Column(scale=1):
                audio_input = gr.Audio(type="filepath", label="🎀 Record your voice", elem_id="audio-input")
                clear_button = gr.Button("Clear", variant="secondary", elem_id="clear-button")
                voice_volume = gr.Slider(minimum=0, maximum=2, value=1, step=0.1, label="Voice Volume", elem_id="voice-volume")

            with gr.Column(scale=1):
                chat_output = gr.Textbox(label="πŸ’¬ AI Response", elem_id="chat-output", lines=5, interactive=False)
                audio_output = gr.Audio(label="πŸ”Š AI Voice Response", autoplay=True, elem_id="audio-output")

        # Add some spacing and a divider
        gr.Markdown("<hr style='border: 1px solid #8ecae6;'/>")

        # Process the recorded audio: transcribe, chat, and synthesize the reply.
        def process_audio(audio, volume):
            logging.info(f"Received audio: {audio}")
            if audio is None:
                return "No audio detected. Please try recording again.", None
            # Pass the requested volume through so the reply is synthesized once,
            # rather than generating the speech a second time just to change volume.
            response, audio_path = transcribe_and_chat(audio, volume)
            logging.info(f"Response: {response}, Audio path: {audio_path}")
            return response, audio_path
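        # .change() also fires when the recording is cleared (value becomes None),
        # which is why process_audio guards against a missing input.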

        audio_input.change(process_audio, inputs=[audio_input, voice_volume], outputs=[chat_output, audio_output])
        clear_button.click(lambda: (None, None), None, [chat_output, audio_output])

        # JavaScript to handle autoplay, automatic submission, and auto-listen
        demo.load(None, js="""
            function() {
                var recordButton;
                
                function findRecordButton() {
                    var buttons = document.querySelectorAll('button');
                    for (var i = 0; i < buttons.length; i++) {
                        if (buttons[i].textContent.includes('Record from microphone')) {
                            return buttons[i];
                        }
                    }
                    return null;
                }

                function startListening() {
                    if (!recordButton) {
                        recordButton = findRecordButton();
                    }
                    if (recordButton) {
                        recordButton.click();
                    }
                }

                document.querySelector("audio").addEventListener("ended", function() {
                    setTimeout(startListening, 500);
                });
                
                function playAssistantAudio() {
                    var audioElements = document.querySelectorAll('audio');
                    if (audioElements.length > 1) {
                        var assistantAudio = audioElements[1];
                        if (assistantAudio) {
                            assistantAudio.play();
                        }
                    }
                }
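
                // Note: 'gradioAudioLoaded' and 'gradioUpdated' are not events
                // stock Gradio dispatches; these listeners only fire if something
                // else emits them. autoplay=True on the output component remains
                // the primary playback mechanism.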

                document.addEventListener('gradioAudioLoaded', function(event) {
                    playAssistantAudio();
                });

                document.addEventListener('gradioUpdated', function(event) {
                    setTimeout(playAssistantAudio, 100);
                });

                // Try to keep audio playing across tab switches: resume any
                // element the browser paused mid-playback, without restarting
                // clips that already finished.
                document.addEventListener("visibilitychange", function() {
                    document.querySelectorAll('audio').forEach(function(audio) {
                        if (audio.paused && !audio.ended && audio.currentTime > 0) {
                            audio.play();
                        }
                    });
                });
            }
        """)

    return demo

# Launch the Gradio app
if __name__ == "__main__":
    demo = create_demo()
    demo.launch(server_name="0.0.0.0", server_port=7860)