ar08 committed (verified)
Commit 3bcf6d8 · 1 Parent(s): e2b03c7

Update app.py

Files changed (1)
  1. app.py +45 -60
app.py CHANGED
@@ -1,29 +1,26 @@
 import gradio as gr
 import asyncio
 import edge_tts
-import speech_recognition as sr
-from pydub import AudioSegment
-from pydub.playback import play
 import os
 from huggingface_hub import InferenceClient
 import whisper
 import torch
-from io import BytesIO
 import tempfile
+from dotenv import load_dotenv
+
+# Load environment variables
+load_dotenv()
 
 # Get the Hugging Face token from environment variable
-hf_token = os.environ.get("HF_TOKEN")
+hf_token = os.getenv("HF_TOKEN")
 if not hf_token:
     raise ValueError("HF_TOKEN environment variable is not set")
 
 # Initialize the Hugging Face Inference Client
-client = InferenceClient(
-    "mistralai/Mistral-Nemo-Instruct-2407",
-    token=hf_token
-)
+client = InferenceClient("mistralai/Mistral-Nemo-Instruct-2407", token=hf_token)
 
 # Load the Whisper model
-whisper_model = whisper.load_model("tiny.en", device='cpu')
+whisper_model = whisper.load_model("tiny.en", device='cuda' if torch.cuda.is_available() else 'cpu')
 
 # Initialize an empty chat history
 chat_history = []
@@ -37,7 +34,6 @@ async def text_to_speech_stream(text):
         if chunk["type"] == "audio":
             audio_data += chunk["data"]
 
-    # Save the audio data to a temporary file
     with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as temp_file:
         temp_file.write(audio_data)
         return temp_file.name
@@ -46,23 +42,20 @@ def whisper_speech_to_text(audio):
     """Convert speech to text using Whisper model."""
     try:
         result = whisper_model.transcribe(audio)
-        text = result['text']
-        return text
+        return result['text']
     except Exception as e:
         print(f"Whisper Error: {e}")
         return None
     finally:
-        # Clear CUDA cache
-        torch.cuda.empty_cache()
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
 
-async def chat_with_ai(message, history):
+async def chat_with_ai(message):
     global chat_history
 
-    # Add user message to chat history
     chat_history.append({"role": "user", "content": message})
 
     try:
-        # Send chat completion request
         response = client.chat_completion(
             messages=[{"role": "system", "content": "You are a helpful voice assistant. Provide concise and clear responses to user queries."}] + chat_history,
             max_tokens=800,
@@ -70,11 +63,8 @@ async def chat_with_ai(message, history):
         )
 
         response_text = response.choices[0].message['content']
-
-        # Add assistant's response to chat history
         chat_history.append({"role": "assistant", "content": response_text})
 
-        # Generate speech for the response
        audio_path = await text_to_speech_stream(response_text)
 
         return response_text, audio_path
@@ -83,52 +73,47 @@ async def chat_with_ai(message, history):
         return str(e), None
 
 def transcribe_and_chat(audio):
-    # Transcribe audio to text
     text = whisper_speech_to_text(audio)
     if text is None:
         return "Sorry, I couldn't understand the audio.", None
 
-    # Chat with AI using the transcribed text
-    response, audio_path = asyncio.run(chat_with_ai(text, []))
+    response, audio_path = asyncio.run(chat_with_ai(text))
     return response, audio_path
 
 # Define the Gradio interface
-with gr.Blocks() as demo:
-    gr.Markdown("# AI Voice Assistant")
-
-    with gr.Row():
-        with gr.Column():
-            audio_input = gr.Audio(type="filepath", label="Speak here", interactive=False)
-            text_input = gr.Textbox(label="Or type your message here")
-
-        with gr.Column():
-            chat_output = gr.Textbox(label="AI Response")
-            audio_output = gr.Audio(label="AI Voice Response", interactive=False)
-
-    audio_button = gr.Button("Send Audio")
-    text_button = gr.Button("Send Text")
-
-    # Add custom JavaScript to handle spacebar press and play audio automatically
-    demo.append(gr.HTML("""
-    <script>
-    document.addEventListener('keydown', function(event) {
-        if (event.code === 'Space') {
-            document.querySelector('input[type="file"]').click();
-        }
-    });
-
-    document.addEventListener('gradioAudioLoaded', function(event) {
-        var audioElement = document.querySelector('audio');
-        if (audioElement) {
-            audioElement.play();
-        }
-    });
-    </script>
-    """))
-
-    audio_button.click(transcribe_and_chat, inputs=audio_input, outputs=[chat_output, audio_output])
-    text_button.click(lambda x: asyncio.run(chat_with_ai(x, [])), inputs=text_input, outputs=[chat_output, audio_output])
+def create_demo():
+    with gr.Blocks() as demo:
+        gr.Markdown("# AI Voice Assistant")
+
+        with gr.Row():
+            with gr.Column(scale=1):
+                audio_input = gr.Audio(source="microphone", type="filepath", label="Press 'Record' to Speak")
+
+            with gr.Column(scale=1):
+                chat_output = gr.Textbox(label="AI Response")
+                audio_output = gr.Audio(label="AI Voice Response")
+
+        demo.load(None, js="""
+        function() {
+            document.querySelector("audio").addEventListener("stop", function() {
+                setTimeout(function() {
+                    document.querySelector('button[title="Submit"]').click();
+                }, 500);
+            });
+            document.addEventListener('gradioAudioLoaded', function(event) {
+                var audioElement = document.querySelector('audio');
+                if (audioElement) {
+                    audioElement.play();
+                }
+            });
+        }
+        """)
+
+        audio_input.change(transcribe_and_chat, inputs=audio_input, outputs=[chat_output, audio_output])
+
+    return demo
 
 # Launch the Gradio app
 if __name__ == "__main__":
-    demo.launch(server_name="0.0.0.0", server_port=7860)
+    demo = create_demo()
+    demo.launch(server_name="0.0.0.0", server_port=7860)
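
A minimal sketch of how the updated pipeline could be exercised locally without the Gradio UI (illustrative only, not part of this commit). It assumes app.py from this commit is importable from the working directory, that HF_TOKEN is exported or stored in a .env file picked up by load_dotenv(), and that "sample.wav" is a placeholder path to any short speech recording; openai-whisper typically also needs ffmpeg available to decode the file.

# smoke_test.py: illustrative sketch, not part of this commit.
# Assumes app.py (this commit) is on the import path, HF_TOKEN is set
# (environment variable or .env file), and "sample.wav" is a placeholder recording.
from app import transcribe_and_chat

if __name__ == "__main__":
    # Whisper transcribes the file, the Mistral-Nemo endpoint answers,
    # and edge-tts writes the spoken reply to a temporary .mp3 file.
    reply_text, reply_audio_path = transcribe_and_chat("sample.wav")
    print("Assistant reply:", reply_text)
    print("Spoken reply saved to:", reply_audio_path)

Importing app runs the module-level setup (load_dotenv, InferenceClient, Whisper model load), so this script fails fast with the same ValueError as the Space if the token is missing.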