KingNish committed
Commit 908ca8a · verified · 1 Parent(s): e0d842c

Update app.py

Files changed (1)
  1. app.py +35 -18
app.py CHANGED
@@ -13,6 +13,12 @@ import requests
 from bs4 import BeautifulSoup
 import urllib
 import random
+from functools import lru_cache
+import concurrent.futures
+
+# Configuration for concurrency
+MAX_WORKERS = 4 # Adjust based on your system resources
+executor = concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS)
 
 # List of user agents to choose from for requests
 _useragent_list = [
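Review note: the new module-level executor is the pool that the async handlers later in this diff hand blocking work to via run_in_executor. A minimal, self-contained sketch of that pattern (the blocking_search stand-in is hypothetical, not part of app.py):

    import asyncio
    import concurrent.futures

    executor = concurrent.futures.ThreadPoolExecutor(max_workers=4)

    def blocking_search(term):
        # Hypothetical stand-in for the app's search(); pretend it blocks on network I/O
        return f"results for {term}"

    async def handler(term):
        loop = asyncio.get_event_loop()  # inside a running coroutine this returns the current loop
        # Offload the blocking call so the event loop stays responsive
        return await loop.run_in_executor(executor, blocking_search, term)

    print(asyncio.run(handler("query")))  # -> results for query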
@@ -89,15 +95,20 @@ def search(term, num_results=3, lang="en", advanced=True, timeout=5, safe="activ
         start += len(result_block)
     return all_results
 
+@lru_cache(maxsize=1) # Cache the models to avoid reloading
+def load_speech_recognition_models():
+    """Loads and caches speech recognition models."""
+    model_name = "neongeckocom/stt_en_citrinet_512_gamma_0_25"
+    sample_rate = 16000
+    preprocessor = torch.jit.load(hf_hub_download(model_name, "preprocessor.ts", subfolder="onnx"))
+    encoder = ort.InferenceSession(hf_hub_download(model_name, "model.onnx", subfolder="onnx"))
+    tokenizer = spm.SentencePieceProcessor(hf_hub_download(model_name, "tokenizer.spm", subfolder="onnx"))
+    return preprocessor, encoder, tokenizer
+
 # Speech Recognition Model Configuration
 model_name = "neongeckocom/stt_en_citrinet_512_gamma_0_25"
 sample_rate = 16000
 
-# Download preprocessor, encoder and tokenizer
-preprocessor = torch.jit.load(hf_hub_download(model_name, "preprocessor.ts", subfolder="onnx"))
-encoder = ort.InferenceSession(hf_hub_download(model_name, "model.onnx", subfolder="onnx"))
-tokenizer = spm.SentencePieceProcessor(hf_hub_download(model_name, "tokenizer.spm", subfolder="onnx"))
-
 # Mistral Model Configuration
 client1 = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
 system_instructions1 = "<s>[SYSTEM] Answer as Real OpenGPT 4o, Made by 'KingNish', Keep conversation very short, clear, friendly and concise. The text provided is a request for a specific type of response from you, the virtual assistant. The request asks you to provide friendly responses. The expectation is that I will avoid introductions and start answering the query directly, elaborating on all aspects of the request. As an AI-powered assistant, your task is to generate responses that appear as if they are created by the user and not an AI-powered assistant. [USER]"
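Review note: @lru_cache(maxsize=1) turns the heavyweight downloads into a one-time cost; every later call returns the very same tuple. The module-level model_name and sample_rate assignments survive the change, so code elsewhere that reads them keeps working. A tiny demonstration of the caching behavior (the loader body is a hypothetical stand-in for the real downloads):

    from functools import lru_cache

    @lru_cache(maxsize=1)
    def load_models():
        print("loading...")  # hypothetical stand-in for the real model downloads
        return ("preprocessor", "encoder", "tokenizer")

    a = load_models()   # prints "loading..." and builds the tuple
    b = load_models()   # served from the cache, no reload
    assert a is b       # identical object both times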
@@ -109,6 +120,8 @@ def to_float32(audio_buffer):
     return np.divide(audio_buffer, np.iinfo(audio_buffer.dtype).max, dtype=np.float32)
 
 def transcribe(audio_path):
+    """Transcribes audio using cached models."""
+    preprocessor, encoder, tokenizer = load_speech_recognition_models()
     audio_file = AudioSegment.from_file(audio_path)
     sr = audio_file.frame_rate
     audio_buffer = np.array(audio_file.get_array_of_samples())
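Review note: with the loader call moved inside transcribe, the first transcription pays the download cost and later ones hit the cache. The surrounding context also shows the pydub-to-numpy path: decode the file, grab the raw samples, then normalize with to_float32. A self-contained sketch of that conversion, synthesizing a test tone instead of reading a file (assumes pydub and numpy are installed):

    import numpy as np
    from pydub.generators import Sine

    def to_float32(audio_buffer):
        # Same normalization as the app: scale integer samples into [-1.0, 1.0]
        return np.divide(audio_buffer, np.iinfo(audio_buffer.dtype).max, dtype=np.float32)

    audio_file = Sine(440).to_audio_segment(duration=500)        # 0.5 s, 440 Hz tone
    sr = audio_file.frame_rate
    audio_buffer = np.array(audio_file.get_array_of_samples())   # int16 samples
    print(sr, to_float32(audio_buffer).dtype)                    # prints the frame rate and float32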
@@ -128,34 +141,38 @@ def transcribe(audio_path):
 
     return text
 
-def model(text, web_search):
-    if web_search is True:
-        """Performs a web search, feeds the results to a language model, and returns the answer."""
-        web_results = search(text)
+async def run_model(text, web_search):
+    """Runs the language model asynchronously."""
+    if web_search:
+        web_results = await asyncio.get_event_loop().run_in_executor(executor, search, text) # Run search in executor
         web2 = ' '.join([f"Link: {res['link']}\nText: {res['text']}\n\n" for res in web_results])
         formatted_prompt = system_instructions1 + text + "[WEB]" + str(web2) + "[ANSWER]"
-        stream = client1.text_generation(formatted_prompt, max_new_tokens=512, stream=True, details=True, return_full_text=False)
-        return "".join([response.token.text for response in stream if response.token.text != "</s>"])
     else:
         formatted_prompt = system_instructions1 + text + "[JARVIS]"
-        stream = client1.text_generation(formatted_prompt, max_new_tokens=512, stream=True, details=True, return_full_text=False)
-        return "".join([response.token.text for response in stream if response.token.text != "</s>"])
+    stream = client1.text_generation(formatted_prompt, max_new_tokens=512, stream=True, details=True, return_full_text=False)
+    return "".join([response.token.text for response in stream if response.token.text != "</s>"])
 
-async def respond(audio, web_search):
-    user = transcribe(audio)
-    reply = model(user, web_search)
+async def generate_speech(reply):
+    """Generates speech asynchronously."""
     communicate = edge_tts.Communicate(reply)
     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
         tmp_path = tmp_file.name
     await communicate.save(tmp_path)
     return tmp_path
 
+async def respond(audio, web_search):
+    """Handles user input, model processing, and response generation."""
+    user = await asyncio.get_event_loop().run_in_executor(executor, transcribe, audio) # Run transcription in executor
+    reply = await run_model(user, web_search)
+    audio_path = await generate_speech(reply)
+    return audio_path
+
 with gr.Blocks() as demo:
     with gr.Row():
         web_search = gr.Checkbox(label="Web Search", value=False)
-    input = gr.Audio(label="Voice Chat", sources="microphone")
+    input = gr.Audio(label="Voice Chat", sources="microphone", type="numpy")
     output = gr.Audio(label="AI",autoplay=True)
-    gr.Interface(fn=respond, inputs=[input, web_search], outputs=[output], live=True, batch=True, max_batch_size=20, delete_cache=(60,60))
+    gr.Interface(fn=respond, inputs=[input, web_search], outputs=[output], live=True)
 
 if __name__ == "__main__":
     demo.queue(max_size=200).launch()
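Review note: respond is now a coroutine chaining three awaits, and Gradio accepts async callbacks directly; the batch=True / max_batch_size / delete_cache flags were dropped in favor of a plain live interface. None of the hunks shown adds "import asyncio", so this presumably already appears near the top of app.py. A minimal sketch of the same wiring, with text widgets standing in for the audio ones and a .submit listener in place of the embedded gr.Interface:

    import asyncio
    import gradio as gr

    async def respond(text, web_search):
        # Hypothetical stand-in for the app's pipeline: transcribe -> run_model -> generate_speech
        await asyncio.sleep(0.1)  # pretend work happens here
        return f"({'web' if web_search else 'chat'}) {text}"

    with gr.Blocks() as demo:
        with gr.Row():
            web_search = gr.Checkbox(label="Web Search", value=False)
        inp = gr.Textbox(label="Voice Chat")
        out = gr.Textbox(label="AI")
        inp.submit(respond, inputs=[inp, web_search], outputs=[out])

    if __name__ == "__main__":
        demo.queue(max_size=200).launch()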
 