Spaces:

pratham0011
/

ConversAI_AI-Voice-Chat-Assistant

Sleeping

App Files Community

pratham0011 committed on Jan 20

Commit

252fde6

verified ·

1 Parent(s): 3e4baba

Upload 6 files

Browse files

Files changed (5) hide show

app.py +103 -96
requirements.txt +0 -0
services/qwen.py +16 -18
services/search.py +121 -85
services/whisper.py +44 -20

app.py CHANGED Viewed

@@ -1,96 +1,103 @@
-import asyncio
-import logging
-import gradio as gr
-from services.qwen import respond
-logger = logging.getLogger(__name__)
-# Track conversation state
-conversation_history = []
-def clear_conversation():
-    global conversation_history
-    conversation_history = []
-    return [],None
-def sync_respond(audio, text_input, do_search, history):
-    if not audio and not text_input:
-        return None, history
-    logger.info(f"Processing request with search enabled: {do_search}")
-    result = asyncio.run(respond(audio, text_input, do_search, history))
-    audio_path, response_text = result
-    if audio:
-        user_message = {"role": "user", "content": "Voice message"}
-    else:
-        user_message = {"role": "user", "content": text_input}
-    assistant_message = {"role": "assistant", "content": response_text}
-    history.extend([user_message, assistant_message])
-    return audio_path, history
-# Build Gradio interface
-with gr.Blocks(theme=gr.themes.Soft()) as interface:
-    gr.Markdown(
-        """
-        <div style="text-align: center; margin-bottom: 1rem;">
-            <h1 style="font-weight: bold;">ConversAI: AI Voice & Chat Assistant</h1>
-        </div>
-        """,
-        show_label=False
-    )
-    # Input components (left column)
-    with gr.Row():
-        with gr.Column(scale=2):
-            audio_input = gr.Audio(
-                label="Your Voice Input",
-                type="filepath",
-                sources=["microphone"]
-            )
-            text_input = gr.Textbox(
-                label="Or Type Your Message",
-                placeholder="Type here..."
-            )
-            search_checkbox = gr.Checkbox(
-                label="Enable web search",
-                value=False
-                )
-            clear_btn = gr.Button("Clear Chat")
-        # Output components (right column)
-        with gr.Column(scale=3):
-            chatbot = gr.Chatbot(label="Conversation", type="messages")
-            audio_output = gr.Audio(
-                label="AI Voice Response",
-                type="filepath",
-                autoplay=True
-            )
-    # Define input event handlers
-    input_events = [
-        audio_input.change(
-            fn=sync_respond,
-            inputs=[audio_input, text_input,search_checkbox, chatbot],
-            outputs=[audio_output, chatbot]
-        ),
-        text_input.submit(
-            fn=sync_respond,
-            inputs=[audio_input, text_input, search_checkbox, chatbot],
-            outputs=[audio_output, chatbot]
-        )
-    ]
-    # Clear chat button handler
-    clear_btn.click(
-        fn=clear_conversation,
-        outputs=[chatbot, audio_output]
-    )
-# Start server
-if __name__ == "__main__":
-    interface.launch(debug=True)

+import asyncio
+import logging
+import gradio as gr
+from services.qwen import respond
+logger = logging.getLogger(__name__)
+# Track conversation state
+conversation_history = []
+def clear_conversation():  # Reset handler wired to the "Clear Chat" button's click event.
+    global conversation_history
+    conversation_history = []  # rebinds the module-level list; NOTE(review): no other visible code in this file reads it
+    return [],None  # maps to outputs [chatbot, audio_output]: empty chat, no audio
+def sync_respond(audio, text_input, do_search, history):  # Blocking handler for Gradio: runs the async respond() to completion.
+    if not audio and not text_input:  # nothing recorded and nothing typed
+        return None, history  # no audio output; chat history returned unchanged
+    logger.info(f"Processing request with search enabled: {do_search}")
+    result = asyncio.run(respond(audio, text_input, do_search, history))  # a new event loop is started for every request
+    audio_path, response_text = result  # respond() result is unpacked as (audio path, reply text)
+    if audio:  # when audio is present it decides the displayed user turn, even if text was also typed
+        user_message = {"role": "user", "content": "Voice message"}  # fixed placeholder; the transcript is not shown in the chat
+    else:
+        user_message = {"role": "user", "content": text_input}
+    assistant_message = {"role": "assistant", "content": response_text}
+    history.extend([user_message, assistant_message])  # appends to the list that was passed in (mutated in place)
+    return audio_path, history  # maps to outputs [audio_output, chatbot]
+# Build Gradio interface
+with gr.Blocks(  # top-level Gradio app; the context manager binds it to `interface`
+    theme=gr.themes.Soft(),
+    css=""".message { font-family: "Times New Roman", Times, serif !important;}"""  # serif font for elements with class "message" (presumably chat bubbles; confirm)
+    ) as interface:
+    gr.Markdown(
+        """
+        <div style="text-align: center; margin-bottom: 1rem;">
+            <h1 style="font-weight: bold;">ConversAI: AI Voice & Chat Assistant</h1>
+        </div>
+        """,
+        show_label=False
+    )
+    # Input components (left column, scale=2)
+    with gr.Row():
+        with gr.Column(scale=2):
+            audio_input = gr.Audio(
+                label="Your Voice Input",
+                type="filepath",  # handlers receive a path to the recorded file
+                sources=["microphone"]  # microphone is the only source listed
+            )
+            text_input = gr.Textbox(
+                label="Or Type Your Message",
+                placeholder="Type here..."
+            )
+            search_checkbox = gr.Checkbox(
+                label="Enable web search",
+                value=False  # web search is off by default
+                )
+            clear_btn = gr.Button("Clear Chat")
+        # Output components (right column, scale=3)
+        with gr.Column(scale=3):
+            chatbot = gr.Chatbot(label="Conversation", type="messages")  # history entries are {"role", "content"} dicts, as built by sync_respond
+            audio_output = gr.Audio(
+                label="AI Voice Response",
+                type="filepath",
+                autoplay=True
+            )
+    # Define input event handlers: both events call sync_respond with the same inputs and outputs
+    input_events = [  # NOTE(review): this list is not referenced again in the visible code
+        audio_input.change(
+            fn=sync_respond,
+            inputs=[audio_input, text_input,search_checkbox, chatbot],
+            outputs=[audio_output, chatbot]
+        ),
+        text_input.submit(
+            fn=sync_respond,
+            inputs=[audio_input, text_input, search_checkbox, chatbot],
+            outputs=[audio_output, chatbot]
+        )
+    ]
+    # Clear chat button handler: no inputs; resets the chatbot and the audio output
+    clear_btn.click(
+        fn=clear_conversation,
+        outputs=[chatbot, audio_output]
+    )
+# Start server
+if __name__ == "__main__":  # launch only when app.py is run as a script
+    interface.launch(
+        server_name="0.0.0.0",  # listen on all network interfaces
+        server_port=7860,
+        debug=True  # NOTE(review): debug mode together with binding to all interfaces; confirm this is intended for deployment
+    )

requirements.txt CHANGED Viewed

Binary files a/requirements.txt and b/requirements.txt differ

services/qwen.py CHANGED Viewed

@@ -2,10 +2,10 @@ import logging
 from typing import List, Dict, Optional, Tuple
 import torch
-# from transformers import pipeline
-from huggingface_hub import InferenceClient
-from config.config import token, SYSTEM_PROMPT
 from services.whisper import generate_speech, transcribe
 from services.search import WebSearcher
@@ -19,12 +19,13 @@ model_kwargs = {
     "torch_dtype": torch.float32,
     'use_cache': True
 }
-client = InferenceClient(
     model="Qwen/Qwen2.5-0.5B-Instruct",
-    token=token
-    # trust_remote_code=True,
-    # device=device,
-    # model_kwargs=model_kwargs
 )
 async def respond(
@@ -64,27 +65,24 @@ async def respond(
             if results:
                 search_context = "Based on search results:\n"
                 for result in results:
-                    snippet = result['content'][:5000].strip()
                     search_context += f"{snippet}\n"
                 prompt = prompt.replace(SYSTEM_PROMPT, f"{SYSTEM_PROMPT}\n{search_context}")
         # Generate response
-        reply = client.text_generation(
             prompt,
-            max_new_tokens=300,
             do_sample=True,
             temperature=0.7,
             top_p=0.9,
-            return_full_text=False
         )
         # Extract and clean assistant response
-        assistant_response = reply  # Reply is already the generated text string
-        if "<|im_start|>assistant\n" in assistant_response:
-            assistant_response = assistant_response.split("<|im_start|>assistant\n")[-1]
-        if "<|im_end|>" in assistant_response:
-            assistant_response = assistant_response.split("<|im_end|>")[0]
-        assistant_response = assistant_response.strip()
         # Convert response to speech
         audio_path = await generate_speech(assistant_response)

 from typing import List, Dict, Optional, Tuple
 import torch
+from transformers import pipeline
+from transformers import pipeline
+from config.config import token, device, SYSTEM_PROMPT
 from services.whisper import generate_speech, transcribe
 from services.search import WebSearcher
     "torch_dtype": torch.float32,
     'use_cache': True
 }
+client = pipeline(  # module-level text-generation pipeline, constructed once at import
+    "text-generation",
     model="Qwen/Qwen2.5-0.5B-Instruct",
+    token=token,
+    trust_remote_code=True,  # NOTE(review): permits running code shipped in the model repo; confirm it is required for this model
+    device=device,  # device value is imported from config.config
+    model_kwargs=model_kwargs
 )
 async def respond(
             if results:
                 search_context = "Based on search results:\n"
                 for result in results:
+                    snippet = result['content'][:500].strip()
                     search_context += f"{snippet}\n"
                 prompt = prompt.replace(SYSTEM_PROMPT, f"{SYSTEM_PROMPT}\n{search_context}")
         # Generate response
+        reply = client(
             prompt,
+            max_new_tokens=400,
             do_sample=True,
             temperature=0.7,
             top_p=0.9,
+            num_return_sequences=1
         )
         # Extract and clean assistant response
+        assistant_response = reply[0]['generated_text']
+        assistant_response = assistant_response.split("<|im_start|>assistant\n")[-1]
+        assistant_response = assistant_response.split("<|im_end|>")[0].strip()
         # Convert response to speech
         audio_path = await generate_speech(assistant_response)

services/search.py CHANGED Viewed

@@ -1,85 +1,121 @@
-import logging
-from typing import List, Dict
-import requests
-from bs4 import BeautifulSoup
-from urllib3.exceptions import InsecureRequestWarning
-# Disable SSL warnings for requests
-requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
-logger = logging.getLogger(__name__)
-class WebSearcher:
-    def __init__(self):
-        self.headers = {
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0"
-        }
-    def extract_text(self, html_content: str) -> str:
-        soup = BeautifulSoup(html_content, 'html.parser')
-        # Remove unwanted elements
-        for element in soup(['script', 'style', 'nav', 'header', 'footer', 'iframe']):
-            element.decompose()
-        text = ' '.join(soup.stripped_strings)
-        return text[:8000]  # Limit text length
-    def search(self, query: str, max_results: int = 3) -> List[Dict]:
-        results = []
-        try:
-            with requests.Session() as session:
-                # Google search parameters
-                search_url = "https://www.google.com/search"
-                params = {
-                    "q": query,
-                    "num": max_results,
-                    "hl": "en"
-                }
-                response = session.get(
-                    search_url,
-                    headers=self.headers,
-                    params=params,
-                    timeout=10,
-                    verify=False
-                )
-                response.raise_for_status()
-                # Parse search results
-                soup = BeautifulSoup(response.text, 'html.parser')
-                search_results = soup.select('div.g')
-                for result in search_results[:max_results]:
-                    link = result.find('a')
-                    if not link:
-                        continue
-                    url = link.get('href', '')
-                    if not url.startswith('http'):
-                        continue
-                    try:
-                        # Fetch webpage content
-                        page_response = session.get(
-                            url,
-                            headers=self.headers,
-                            timeout=5,
-                            verify=False
-                        )
-                        page_response.raise_for_status()
-                        content = self.extract_text(page_response.text)
-                        results.append({
-                            "url": url,
-                            "content": content
-                        })
-                        logger.info(f"Successfully fetched content from {url}")
-                    except Exception as e:
-                        logger.warning(f"Failed to fetch {url}: {str(e)}")
-                        continue
-        except Exception as e:
-            logger.error(f"Search failed: {str(e)}")
-        return results[:max_results]

+# import logging
+# from typing import List, Dict
+# import requests
+# from bs4 import BeautifulSoup
+# from urllib3.exceptions import InsecureRequestWarning
+# # Disable SSL warnings for requests
+# requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
+# logger = logging.getLogger(__name__)
+# class WebSearcher:
+#     def __init__(self):
+#         self.headers = {
+#             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0"
+#         }
+#     def extract_text(self, html_content: str) -> str:
+#         soup = BeautifulSoup(html_content, 'html.parser')
+#         # Remove unwanted elements
+#         for element in soup(['script', 'style', 'nav', 'header', 'footer', 'iframe']):
+#             element.decompose()
+#         text = ' '.join(soup.stripped_strings)
+#         return text[:8000]  # Limit text length
+#     def search(self, query: str, max_results: int = 3) -> List[Dict]:
+#         results = []
+#         try:
+#             with requests.Session() as session:
+#                 # Google search parameters
+#                 search_url = "https://www.google.com/search"
+#                 params = {
+#                     "q": query,
+#                     "num": max_results,
+#                     "hl": "en"
+#                 }
+#                 response = session.get(
+#                     search_url,
+#                     headers=self.headers,
+#                     params=params,
+#                     timeout=10,
+#                     verify=False
+#                 )
+#                 response.raise_for_status()
+#                 # Parse search results
+#                 soup = BeautifulSoup(response.text, 'html.parser')
+#                 search_results = soup.select('div.g')
+#                 for result in search_results[:max_results]:
+#                     link = result.find('a')
+#                     if not link:
+#                         continue
+#                     url = link.get('href', '')
+#                     if not url.startswith('http'):
+#                         continue
+#                     try:
+#                         # Fetch webpage content
+#                         page_response = session.get(
+#                             url,
+#                             headers=self.headers,
+#                             timeout=5,
+#                             verify=False
+#                         )
+#                         page_response.raise_for_status()
+#                         content = self.extract_text(page_response.text)
+#                         results.append({
+#                             "url": url,
+#                             "content": content
+#                         })
+#                         logger.info(f"Successfully fetched content from {url}")
+#                     except Exception as e:
+#                         logger.warning(f"Failed to fetch {url}: {str(e)}")
+#                         continue
+#         except Exception as e:
+#             logger.error(f"Search failed: {str(e)}")
+#         return results[:max_results]
+import logging
+from typing import List, Dict
+from transformers.agents import DuckDuckGoSearchTool
+logger = logging.getLogger(__name__)
+class WebSearcher:  # Thin wrapper around transformers.agents.DuckDuckGoSearchTool.
+    def __init__(self):
+        self.search_tool = DuckDuckGoSearchTool()
+    def search(self, query: str) -> List[Dict]:  # returns a one-element list of {"url", "content"}, or [] if the search raises
+        try:
+            # Execute search
+            search_results = self.search_tool(query)
+            # Flatten a list result into one space-separated string
+            if isinstance(search_results, list):
+                search_results = ' '.join(str(result) for result in search_results)
+            results = [{
+                "url": "duckduckgo_search",  # fixed placeholder label, not a real URL
+                "content": str(search_results) # ensures a string; no length limit is applied here
+            }]
+            return results
+        except Exception as e:  # any failure is logged and reported as "no results"
+            logger.error(f"Search error: {str(e)}")
+            return []
+# Module-level WebSearcher instance, created when services/search.py is imported
+searcher = WebSearcher()

services/whisper.py CHANGED Viewed

@@ -1,19 +1,29 @@
 import os
 import tempfile
 import logging
-import requests
 from typing import Optional
 import edge_tts
-from config.config import VOICE, FALLBACK_VOICES, token
 logger = logging.getLogger(__name__)
 # Whisper model for speech to text
-API_URL = "https://api-inference.huggingface.co/models/openai/whisper-tiny"
-headers = {"Authorization": f"Bearer {token}"}
 # Voice selection handling
 async def get_valid_voice() -> str:
@@ -49,20 +59,34 @@ async def generate_speech(text: str) -> Optional[str]:
 # Speech-to-text using Whisper
 async def transcribe(audio_file: str) -> str:
-    try:
-        with open(audio_file, "rb") as f:
-            data = f.read()
-        response = requests.post(API_URL, headers=headers, data=data)
-        result = response.json()
-        if "text" in result:
-            transcription = result["text"].strip()
-            logger.info(f"Transcribed text: {transcription}")
-            return transcription
-        else:
-            raise ValueError("No transcription in response")
-    except Exception as e:
-        logger.error(f"Transcription error: {str(e)}")
-        raise RuntimeError(f"Failed to transcribe audio: {str(e)}")

 import os
 import tempfile
 import logging
 from typing import Optional
+import torch
+import librosa
 import edge_tts
+from transformers import WhisperProcessor, WhisperForConditionalGeneration
+from config.config import VOICE, FALLBACK_VOICES
 logger = logging.getLogger(__name__)
 # Whisper model for speech to text
+processor = WhisperProcessor.from_pretrained(  # loaded at import; transcribe() uses it for feature extraction and decoding
+    "openai/whisper-tiny",
+    local_files_only=False
+)
+model = WhisperForConditionalGeneration.from_pretrained(  # loaded at import; transcribe() calls model.generate()
+    "openai/whisper-tiny",
+    local_files_only=False,
+    low_cpu_mem_usage=True,
+    torch_dtype=torch.float32,
+).to("cpu")  # the model is placed on the CPU
 # Voice selection handling
 async def get_valid_voice() -> str:
 # Speech-to-text using Whisper
 async def transcribe(audio_file: str) -> str:  # Transcribe an audio file to text with the module-level Whisper processor/model.
+    audio, sr = librosa.load(  # NOTE(review): no await anywhere in this body; loading and inference run synchronously
+        audio_file,
+        sr=16000,  # resample to 16 kHz
+        mono=True,  # downmix to a single channel
+        duration=30  # read at most the first 30 seconds of audio
+    )
+    inputs = processor(
+        audio,
+        sampling_rate=sr,
+        return_tensors="pt",  # PyTorch tensors
+        return_attention_mask=True
+    ).to(model.device)  # move the inputs to the same device as the model
+    with torch.no_grad():  # inference only; gradient tracking disabled
+        generated_ids = model.generate(
+            input_features=inputs.input_features,
+            attention_mask=inputs.attention_mask,
+            language="en",  # language is fixed to English
+            task="transcribe",
+            max_length=448,
+            temperature=0.0
+        )
+        transcription = processor.batch_decode(
+            generated_ids,
+            skip_special_tokens=True
+        )[0].strip()  # one input was passed, so take the first decoded string
+    logger.info(f"Transcribed text: {transcription}")
+    return transcription  # NOTE(review): unlike the removed HTTP-based version, errors are not caught or wrapped in RuntimeError here