Voice-Chat-AI

Build error

App Files Files Community

KingNish committed on Jun 5, 2024

Commit

5c47ebc

verified ·

1 Parent(s): 29ebca1

Update app.py

Browse files

Files changed (1) hide show

app.py +23 -41

app.py CHANGED Viewed

@@ -12,13 +12,6 @@ from huggingface_hub import hf_hub_download, InferenceClient
 import requests
 from bs4 import BeautifulSoup
 import urllib
-import random
-from functools import lru_cache
-import concurrent.futures
-# Configuration for concurrency
-MAX_WORKERS = 4  # Adjust based on your system resources
-executor = concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS)
 def extract_text_from_webpage(html_content):
     """Extracts visible text from HTML content using BeautifulSoup."""
@@ -37,7 +30,7 @@ def search(term, num_results=3, lang="en", advanced=True, timeout=5, safe="activ
     start = 0
     all_results = []
     # Limit the number of characters from each webpage to stay under the token limit
-    max_chars_per_page = 4000  # Adjust this value based on your token limit and average webpage length
     with requests.Session() as session:
         while start < num_results:
@@ -71,29 +64,24 @@ def search(term, num_results=3, lang="en", advanced=True, timeout=5, safe="activ
                         # Truncate text if it's too long
                         if len(visible_text) > max_chars_per_page:
                             visible_text = visible_text[:max_chars_per_page] + "..."
-                        all_results.append({"link": link, "text": visible_text})
                     except requests.exceptions.RequestException as e:
                         print(f"Error fetching or processing {link}: {e}")
-                        all_results.append({"link": link, "text": None})
                 else:
-                    all_results.append({"link": None, "text": None})
             start += len(result_block)
     return all_results
-@lru_cache(maxsize=1)  # Cache the models to avoid reloading
-def load_speech_recognition_models():
-    """Loads and caches speech recognition models."""
-    model_name = "neongeckocom/stt_en_citrinet_512_gamma_0_25"
-    sample_rate = 16000
-    preprocessor = torch.jit.load(hf_hub_download(model_name, "preprocessor.ts", subfolder="onnx"))
-    encoder = ort.InferenceSession(hf_hub_download(model_name, "model.onnx", subfolder="onnx"))
-    tokenizer = spm.SentencePieceProcessor(hf_hub_download(model_name, "tokenizer.spm", subfolder="onnx"))
-    return preprocessor, encoder, tokenizer
 # Speech Recognition Model Configuration
 model_name = "neongeckocom/stt_en_citrinet_512_gamma_0_25"
 sample_rate = 16000
 # Mistral Model Configuration
 client1 = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
 system_instructions1 = "<s>[SYSTEM] Answer as Real OpenGPT 4o, Made by 'KingNish', Keep conversation very short, clear, friendly and concise. The text provided is a request for a specific type of response from you, the virtual assistant. The request asks you to provide friendly responses. The expectation is that I will avoid introductions and start answering the query directly, elaborating on all aspects of the request. As an AI-powered assistant, your task is to generate responses that appear as if they are created by the user and not an AI-powered assistant. [USER]"
@@ -105,8 +93,6 @@ def to_float32(audio_buffer):
     return np.divide(audio_buffer, np.iinfo(audio_buffer.dtype).max, dtype=np.float32)
 def transcribe(audio_path):
-    """Transcribes audio using cached models."""
-    preprocessor, encoder, tokenizer = load_speech_recognition_models()
     audio_file = AudioSegment.from_file(audio_path)
     sr = audio_file.frame_rate
     audio_buffer = np.array(audio_file.get_array_of_samples())
@@ -126,38 +112,34 @@ def transcribe(audio_path):
     return text
-async def run_model(text, web_search):
-    """Runs the language model asynchronously."""
-    if web_search:
-        web_results = await asyncio.get_event_loop().run_in_executor(executor, search, text)  # Run search in executor
-        web2 = ' '.join([f"Link: {res['link']}\nText: {res['text']}\n\n" for res in web_results])
         formatted_prompt = system_instructions1 + text + "[WEB]" + str(web2) + "[ANSWER]"
     else:
         formatted_prompt = system_instructions1 + text + "[JARVIS]"
-    stream = client1.text_generation(formatted_prompt, max_new_tokens=512, stream=True, details=True, return_full_text=False)
-    return "".join([response.token.text for response in stream if response.token.text != "</s>"])
-async def generate_speech(reply):
-    """Generates speech asynchronously."""
     communicate = edge_tts.Communicate(reply)
     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
         tmp_path = tmp_file.name
         await communicate.save(tmp_path)
     return tmp_path
-async def respond(audio, web_search):
-    """Handles user input, model processing, and response generation."""
-    user = await asyncio.get_event_loop().run_in_executor(executor, transcribe, audio)  # Run transcription in executor
-    reply = await run_model(user, web_search)
-    audio_path = await generate_speech(reply)
-    return audio_path
 with gr.Blocks() as demo:
     with gr.Row():
         web_search = gr.Checkbox(label="Web Search", value=False)
-        input = gr.Audio(label="Voice Chat", sources="microphone", type="numpy")
         output = gr.Audio(label="AI",autoplay=True)
-        gr.Interface(fn=respond, inputs=[input, web_search], outputs=[output], live=True)
 if __name__ == "__main__":
     demo.queue(max_size=200).launch()

 import requests
 from bs4 import BeautifulSoup
 import urllib
 def extract_text_from_webpage(html_content):
     """Extracts visible text from HTML content using BeautifulSoup."""
     start = 0
     all_results = []
     # Limit the number of characters from each webpage to stay under the token limit
+    max_chars_per_page = 3000  # Adjust this value based on your token limit and average webpage length
     with requests.Session() as session:
         while start < num_results:
                         # Truncate text if it's too long
                         if len(visible_text) > max_chars_per_page:
                             visible_text = visible_text[:max_chars_per_page] + "..."
+                        all_results.append({"text": visible_text})
                     except requests.exceptions.RequestException as e:
                         print(f"Error fetching or processing {link}: {e}")
+                        all_results.append({"text": None})
                 else:
+                    all_results.append({"text": None})
             start += len(result_block)
     return all_results
 # Speech Recognition Model Configuration
 model_name = "neongeckocom/stt_en_citrinet_512_gamma_0_25"
 sample_rate = 16000
+# Download preprocessor, encoder and tokenizer
+preprocessor = torch.jit.load(hf_hub_download(model_name, "preprocessor.ts", subfolder="onnx"))
+encoder = ort.InferenceSession(hf_hub_download(model_name, "model.onnx", subfolder="onnx"))
+tokenizer = spm.SentencePieceProcessor(hf_hub_download(model_name, "tokenizer.spm", subfolder="onnx"))
 # Mistral Model Configuration
 client1 = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
 system_instructions1 = "<s>[SYSTEM] Answer as Real OpenGPT 4o, Made by 'KingNish', Keep conversation very short, clear, friendly and concise. The text provided is a request for a specific type of response from you, the virtual assistant. The request asks you to provide friendly responses. The expectation is that I will avoid introductions and start answering the query directly, elaborating on all aspects of the request. As an AI-powered assistant, your task is to generate responses that appear as if they are created by the user and not an AI-powered assistant. [USER]"
     return np.divide(audio_buffer, np.iinfo(audio_buffer.dtype).max, dtype=np.float32)
 def transcribe(audio_path):
     audio_file = AudioSegment.from_file(audio_path)
     sr = audio_file.frame_rate
     audio_buffer = np.array(audio_file.get_array_of_samples())
     return text
def model(text, web_search):
    """Generate the assistant's reply to *text*, optionally using web results.

    Args:
        text: The user's (transcribed) request.
        web_search: When truthy, ``search(text)`` is run first and the page
            texts it returns are appended to the prompt after a ``[WEB]`` tag.

    Returns:
        The generated reply as one string, with the ``</s>`` end-of-sequence
        token filtered out of the stream.
    """
    if web_search:
        web_results = search(text)
        # search() records failed or empty fetches as {"text": None}; leave
        # those out so the literal string "None" never reaches the prompt.
        web2 = ' '.join(
            f"Text: {res['text']}\n\n" for res in web_results if res.get('text')
        )
        formatted_prompt = system_instructions1 + text + "[WEB]" + web2 + "[ANSWER]"
    else:
        formatted_prompt = system_instructions1 + text + "[JARVIS]"

    # Both prompt variants go through the same streaming generation call.
    stream = client1.text_generation(formatted_prompt, max_new_tokens=512, stream=True, details=True, return_full_text=False)
    return "".join(response.token.text for response in stream if response.token.text != "</s>")
async def respond(audio, web_search):
    """Turn one recorded utterance into a spoken reply.

    The recording is transcribed with ``transcribe``, answered by ``model``
    and the answer is synthesised to speech with edge-tts.

    Args:
        audio: The recording to answer; passed unchanged to ``transcribe``.
        web_search: Forwarded to ``model`` to enable web-grounded answers.

    Returns:
        Path of a temporary file containing the synthesised speech, or
        ``None`` when there is no recording to answer. The file is created
        with ``delete=False`` and is not removed by this function.
    """
    # Kept local so this block does not rely on a top-level import.
    import asyncio

    # A live interface also fires when the recording is cleared.
    if audio is None:
        return None

    # transcribe() and model() are blocking calls; run them in the default
    # executor so the event loop stays free while they work.
    loop = asyncio.get_running_loop()
    user = await loop.run_in_executor(None, transcribe, audio)
    reply = await loop.run_in_executor(None, model, user, web_search)

    communicate = edge_tts.Communicate(reply)
    # Only the unique file name is needed; close our handle before edge-tts
    # writes to that path.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
        tmp_path = tmp_file.name
    await communicate.save(tmp_path)
    return tmp_path
 # Gradio UI: a web-search toggle, a microphone input and an autoplaying
 # audio output, wired to respond() through a live interface.
 with gr.Blocks() as demo:
     with gr.Row():
         web_search = gr.Checkbox(label="Web Search", value=False)
         # type="filepath": respond() forwards the recording to transcribe(),
         # which opens it with AudioSegment.from_file, so it must be a path
         # rather than Gradio's default numpy value.
         input = gr.Audio(label="Voice Chat", sources="microphone", type="filepath")
         output = gr.Audio(label="AI",autoplay=True)
         # respond() handles a single recording per call, so the interface is
         # not batched (a batched fn would receive lists of inputs).
         gr.Interface(fn=respond, inputs=[input, web_search], outputs=[output], live=True, delete_cache=(60,60))
 if __name__ == "__main__":
     demo.queue(max_size=200).launch()