pratham0011 committed
Commit 3ba1bc7 · verified · 1 Parent(s): 1b5933b

Upload 7 files

Files changed (5)
  1. __init__.py +0 -0
  2. config.py +25 -0
  3. qwen.py +95 -0
  4. search.py +85 -0
  5. whisper.py +68 -0
__init__.py ADDED
File without changes
config.py ADDED
@@ -0,0 +1,25 @@
+ import os
+ import logging
+ from dotenv import load_dotenv
+
+ # Configure logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ # Load environment variables
+ load_dotenv()
+ token = os.getenv("hf_key")
+
+ # Set compute device (cpu/cuda)
+ device = "cpu"
+ logger.info(f"Device set to use {device}")
+
+ # AI Assistant Configuration
+ SYSTEM_PROMPT = """You are ConversAI, a helpful AI assistant who remembers conversation history. Keep responses clear, friendly and natural. Always refer to previous context when responding."""
+
+ # Text-to-Speech Voice Settings (primary/backup)
+ VOICE = "en-US-JennyNeural"
+ FALLBACK_VOICES = ["en-US-ChristopherNeural", "en-US-EricNeural"]
+
+ # Audio Output Configuration
+ OUTPUT_FORMAT = "audio-24khz-48kbit-mono-mp3"
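
Note: config.py reads the Hugging Face token from an environment variable named hf_key, so a minimal .env file next to the app would look like the following (the value here is a placeholder, not a real token):

hf_key=hf_xxxxxxxxxxxxxxxxxxxxxxxx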
qwen.py ADDED
@@ -0,0 +1,95 @@
+ import logging
+ from typing import List, Dict, Optional, Tuple
+
+ import torch
+ # from transformers import pipeline
+ from huggingface_hub import InferenceClient
+
+ from config.config import token, SYSTEM_PROMPT
+ from services.whisper import generate_speech, transcribe
+ from services.search import WebSearcher
+
+ logger = logging.getLogger(__name__)
+
+ searcher = WebSearcher()
+
+ # Qwen configuration (kwargs for a local transformers pipeline; unused while the hosted client below is active)
+ model_kwargs = {
+     "low_cpu_mem_usage": True,
+     "torch_dtype": torch.float32,
+     "use_cache": True
+ }
+ client = InferenceClient(
+     model="Qwen/Qwen2.5-0.5B-Instruct",
+     token=token
+     # trust_remote_code=True,
+     # device=device,
+     # model_kwargs=model_kwargs
+ )
+
+ async def respond(
+     audio: Optional[str] = None,
+     text: Optional[str] = None,
+     do_search: bool = False,
+     history: Optional[List[Dict]] = None
+ ) -> Tuple[Optional[str], str]:
+     try:
+         if text:
+             user_text = text.strip()
+         elif audio:
+             user_text = await transcribe(audio)
+         else:
+             return None, "No input provided"
+
+         # Build conversation context
+         messages = []
+         messages.append({"role": "system", "content": SYSTEM_PROMPT})
+
+         if history:
+             messages.extend(history)
+
+         # Format message history with Qwen's ChatML-style tags
+         prompt = ""
+         for msg in messages:
+             role = msg["role"]
+             content = msg["content"]
+             prompt += f"<|im_start|>{role}\n{content}<|im_end|>\n"
+
+         # Add current user message and open the assistant turn
+         prompt += f"<|im_start|>user\n{user_text}<|im_end|>\n<|im_start|>assistant\n"
+
+         # Inject web-search context into the system prompt if enabled
+         if do_search:
+             results = searcher.search(user_text)
+             if results:
+                 search_context = "Based on search results:\n"
+                 for result in results:
+                     snippet = result["content"][:5000].strip()
+                     search_context += f"{snippet}\n"
+                 prompt = prompt.replace(SYSTEM_PROMPT, f"{SYSTEM_PROMPT}\n{search_context}")
+
+         # Generate response
+         reply = client.text_generation(
+             prompt,
+             max_new_tokens=300,
+             do_sample=True,
+             temperature=0.7,
+             top_p=0.9,
+             return_full_text=False
+         )
+
+         # Extract and clean assistant response
+         assistant_response = reply  # reply is already the generated text string
+         if "<|im_start|>assistant\n" in assistant_response:
+             assistant_response = assistant_response.split("<|im_start|>assistant\n")[-1]
+         if "<|im_end|>" in assistant_response:
+             assistant_response = assistant_response.split("<|im_end|>")[0]
+         assistant_response = assistant_response.strip()
+
+         # Convert response to speech
+         audio_path = await generate_speech(assistant_response)
+         return audio_path, assistant_response
+
+     except Exception as e:
+         logger.error(f"Error in respond: {str(e)}")
+         return None, "Sorry, I encountered an error"
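
For context, a minimal driver for respond() might look like the sketch below. It assumes the files live in config/ and services/ packages as the imports above suggest, and that a valid hf_key is set in .env; the history format mirrors the role/content dicts the function builds internally:

import asyncio
from services.qwen import respond

async def main():
    history = [
        {"role": "user", "content": "Hi, I'm Sam."},
        {"role": "assistant", "content": "Hi Sam! How can I help?"},
    ]
    # Returns (audio_path, reply_text); pass audio=... instead of text=... for voice input
    audio_path, reply = await respond(text="What's my name?", history=history)
    print(reply, "->", audio_path)

asyncio.run(main())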
search.py ADDED
@@ -0,0 +1,85 @@
+ import logging
+ from typing import List, Dict
+
+ import requests
+ from bs4 import BeautifulSoup
+ from urllib3.exceptions import InsecureRequestWarning
+
+ # Disable SSL warnings for requests
+ requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
+
+ logger = logging.getLogger(__name__)
+
+ class WebSearcher:
+     def __init__(self):
+         self.headers = {
+             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0"
+         }
+
+     def extract_text(self, html_content: str) -> str:
+         soup = BeautifulSoup(html_content, 'html.parser')
+         # Remove unwanted elements
+         for element in soup(['script', 'style', 'nav', 'header', 'footer', 'iframe']):
+             element.decompose()
+         text = ' '.join(soup.stripped_strings)
+         return text[:8000]  # Limit text length
+
+     def search(self, query: str, max_results: int = 3) -> List[Dict]:
+         results = []
+         try:
+             with requests.Session() as session:
+                 # Google search parameters
+                 search_url = "https://www.google.com/search"
+                 params = {
+                     "q": query,
+                     "num": max_results,
+                     "hl": "en"
+                 }
+
+                 response = session.get(
+                     search_url,
+                     headers=self.headers,
+                     params=params,
+                     timeout=3,
+                     verify=False
+                 )
+                 response.raise_for_status()
+
+                 # Parse search results
+                 soup = BeautifulSoup(response.text, 'html.parser')
+                 search_results = soup.select('div.g')
+
+                 for result in search_results[:max_results]:
+                     link = result.find('a')
+                     if not link:
+                         continue
+
+                     url = link.get('href', '')
+                     if not url.startswith('http'):
+                         continue
+
+                     try:
+                         # Fetch webpage content
+                         page_response = session.get(
+                             url,
+                             headers=self.headers,
+                             timeout=5,
+                             verify=False
+                         )
+                         page_response.raise_for_status()
+
+                         content = self.extract_text(page_response.text)
+                         results.append({
+                             "url": url,
+                             "content": content
+                         })
+                         logger.info(f"Successfully fetched content from {url}")
+
+                     except Exception as e:
+                         logger.warning(f"Failed to fetch {url}: {str(e)}")
+                         continue
+
+         except Exception as e:
+             logger.error(f"Search failed: {str(e)}")
+
+         return results[:max_results]
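
As a standalone check, WebSearcher can be exercised directly; a quick sketch (the query string is arbitrary, and results depend on whether Google serves parseable div.g blocks to this User-Agent):

from services.search import WebSearcher

searcher = WebSearcher()
for r in searcher.search("python asyncio tutorial", max_results=2):
    print(r["url"])
    print(r["content"][:200], "...")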
whisper.py ADDED
@@ -0,0 +1,68 @@
+ import os
+ import tempfile
+ import logging
+ import requests
+ from typing import Optional
+
+ import edge_tts
+
+ from config.config import VOICE, FALLBACK_VOICES, token
+
+
+ logger = logging.getLogger(__name__)
+
+ # Whisper model for speech to text
+ API_URL = "https://api-inference.huggingface.co/models/openai/whisper-tiny"
+ headers = {"Authorization": f"Bearer {token}"}
+
+ # Voice selection handling
+ async def get_valid_voice() -> str:
+     available_voices = await edge_tts.list_voices()
+     voice_names = [VOICE] + FALLBACK_VOICES
+
+     available_voice_names = {v["ShortName"] for v in available_voices}
+     for voice in voice_names:
+         if voice in available_voice_names:
+             return voice
+
+     raise RuntimeError("No valid voice found")
+
+ # Text-to-speech conversion using Edge TTS
+ async def generate_speech(text: str) -> Optional[str]:
+     if not text or not isinstance(text, str):
+         raise ValueError("Invalid text input")
+
+     voice = await get_valid_voice()
+     logger.info(f"Using voice: {voice}")
+
+     with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
+         tmp_path = tmp_file.name
+
+     communicate = edge_tts.Communicate(text, voice)
+     await communicate.save(tmp_path)
+
+     if not os.path.exists(tmp_path) or os.path.getsize(tmp_path) == 0:
+         raise RuntimeError("Speech file empty or not created")
+
+     logger.info(f"Speech generated successfully: {tmp_path}")
+     return tmp_path
+
+ # Speech-to-text using Whisper
+ async def transcribe(audio_file: str) -> str:
+     try:
+         with open(audio_file, "rb") as f:
+             data = f.read()
+
+         response = requests.post(API_URL, headers=headers, data=data)
+         result = response.json()
+
+         if "text" in result:
+             transcription = result["text"].strip()
+             logger.info(f"Transcribed text: {transcription}")
+             return transcription
+         else:
+             raise ValueError("No transcription in response")
+
+     except Exception as e:
+         logger.error(f"Transcription error: {str(e)}")
+         raise RuntimeError(f"Failed to transcribe audio: {str(e)}")
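
A quick round-trip sketch for the two functions above (requires network access and a valid hf_key, since transcribe calls the hosted Whisper endpoint; the sample text is arbitrary):

import asyncio
from services.whisper import generate_speech, transcribe

async def main():
    # TTS: synthesize an MP3 with Edge TTS, then STT it back through Whisper
    mp3_path = await generate_speech("Hello from ConversAI!")
    text = await transcribe(mp3_path)
    print(mp3_path, "->", text)

asyncio.run(main())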