Spaces:

pratham0011
/

ConversAI_AI-Voice-Chat-Assistant

Sleeping

App Files Files Community

pratham0011 committed on Jan 11

Commit

d89ceaa

verified ·

1 Parent(s): d3b6224

Upload 8 files

Browse files

Files changed (8) hide show

app.py +100 -0
config/__init__.py +0 -0
config/config.py +25 -0
requirements.txt +11 -0
services/__init__.py +0 -0
services/qwen.py +93 -0
services/search.py +85 -0
services/whisper.py +92 -0

app.py ADDED Viewed

	@@ -0,0 +1,100 @@

+import asyncio
+import logging
+import gradio as gr
+from services.qwen import respond
+logger = logging.getLogger(__name__)
+# Module-level conversation log; rebound to a fresh list whenever the chat is cleared
+conversation_history = []
+def clear_conversation():
+    """Reset the stored conversation and blank the UI outputs.
+
+    Returns:
+        A pair ``(messages, audio)``: an empty message list and ``None``
+        for the audio value, in that order.
+    """
+    global conversation_history
+    conversation_history = list()
+    empty_chat, no_audio = [], None
+    return empty_chat, no_audio
+def sync_respond(audio, text_input, do_search, history):
+    """Synchronous Gradio handler that runs the async ``respond`` coroutine.
+
+    Args:
+        audio: Path of the recorded audio file, or a falsy value if none.
+        text_input: Typed message, or a falsy value if none.
+        do_search: Whether web search should be enabled for this request.
+        history: List of ``{"role", "content"}`` message dicts shown in the
+            chatbot; ``None`` is treated as an empty conversation.
+
+    Returns:
+        ``(audio_path, history)`` where ``audio_path`` is the spoken reply
+        (``None`` when nothing was processed) and ``history`` is the same
+        list extended with the new user and assistant messages.
+    """
+    # Tolerate a missing history so the extend() below cannot fail on None
+    history = [] if history is None else history
+    if not audio and not text_input:
+        return None, history
+    logger.info("Processing request with search enabled: %s", do_search)
+    audio_path, response_text = asyncio.run(
+        respond(audio, text_input, do_search, history)
+    )
+    # respond() answers the typed text when both inputs are present, so label
+    # the user turn from the text first and fall back to the voice placeholder
+    user_content = text_input if text_input else "Voice message"
+    history.extend([
+        {"role": "user", "content": user_content},
+        {"role": "assistant", "content": response_text},
+    ])
+    return audio_path, history
+# Build the Gradio interface. Components are created in layout order inside
+# nested context managers, so the statement order below defines the page.
+with gr.Blocks(theme=gr.themes.Soft()) as interface:
+    # Centered page title rendered from raw HTML
+    gr.Markdown(
+        """
+        <div style="text-align: center; margin-bottom: 1rem;">
+            <h1 style="font-weight: bold;">ConversAI: AI Voice & Chat Assistant</h1>
+        </div>
+        """,
+        show_label=False
+    )
+    # Single row holding the input column (scale 2) and output column (scale 3)
+    with gr.Row():
+        # Input components (left column)
+        with gr.Column(scale=2):
+            # Microphone recording, handed to the handler as a file path
+            audio_input = gr.Audio(
+                label="Your Voice Input",
+                type="filepath",
+                sources=["microphone"]
+            )
+            text_input = gr.Textbox(
+                label="Or Type Your Message",
+                placeholder="Type here..."
+            )
+            # Web search is opt-in per request
+            search_checkbox = gr.Checkbox(
+                label="Enable web search",
+                value=False
+                )
+            clear_btn = gr.Button("Clear Chat")
+        # Output components (right column)
+        with gr.Column(scale=3):
+            # type="messages": history is a list of {"role", "content"} dicts
+            chatbot = gr.Chatbot(label="Conversation", type="messages")
+            # Spoken reply; autoplay starts playback as soon as a file arrives
+            audio_output = gr.Audio(
+                label="AI Voice Response",
+                type="filepath",
+                autoplay=True
+            )
+    # Define input event handlers: both a change of the audio input and a
+    # submit of the textbox call sync_respond with the same inputs/outputs.
+    # NOTE(review): input_events is not referenced again in this file.
+    input_events = [
+        audio_input.change(
+            fn=sync_respond,
+            inputs=[audio_input, text_input,search_checkbox, chatbot],
+            outputs=[audio_output, chatbot]
+        ),
+        text_input.submit(
+            fn=sync_respond,
+            inputs=[audio_input, text_input, search_checkbox, chatbot],
+            outputs=[audio_output, chatbot]
+        )
+    ]
+    # Clear chat button handler: resets the chatbot and the audio player
+    clear_btn.click(
+        fn=clear_conversation,
+        outputs=[chatbot, audio_output]
+    )
+# Start server when run as a script: all interfaces, port 7860, debug mode on
+if __name__ == "__main__":
+    interface.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        debug=True
+    )

config/__init__.py ADDED Viewed

File without changes

config/config.py ADDED Viewed

	@@ -0,0 +1,25 @@

+import os
+import logging
+from dotenv import load_dotenv
+# Logging: INFO level via basicConfig, plus a logger named after this module
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+# Load variables from a .env file, then read the `hf_key` token (None if unset)
+load_dotenv()
+token = os.environ.get("hf_key")
+# Compute device (cpu/cuda)
+device = "cpu"
+logger.info(f"Device set to use {device}")
+# System prompt for the assistant
+SYSTEM_PROMPT = (
+    "You are ConversAI, a helpful AI assistant who remembers conversation history. "
+    "Keep responses clear, friendly and natural. "
+    "Always refer to previous context when responding."
+)
+# Text-to-speech voices: primary first, backups tried in the listed order
+VOICE = "en-US-JennyNeural"
+FALLBACK_VOICES = [
+    "en-US-ChristopherNeural",
+    "en-US-EricNeural",
+]
+# Audio output format identifier
+OUTPUT_FORMAT = "audio-24khz-48kbit-mono-mp3"

requirements.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+gradio
+edge-tts
+numpy
+soxr
+pydub
+torch
+sentencepiece
+onnxruntime
+huggingface-hub
+python-dotenv
+asyncio

services/__init__.py ADDED Viewed

File without changes

services/qwen.py ADDED Viewed

	@@ -0,0 +1,93 @@

+import logging
+from typing import List, Dict, Optional, Tuple
+import torch
+from transformers import pipeline
+from transformers import pipeline
+from config.config import token, device, SYSTEM_PROMPT
+from services.whisper import generate_speech, transcribe
+from services.search import WebSearcher
+logger = logging.getLogger(__name__)
+searcher = WebSearcher()
+# Qwen pipeline setup: model-loading options first, then the shared
+# text-generation pipeline that respond() calls
+model_kwargs = dict(
+    low_cpu_mem_usage=True,
+    torch_dtype=torch.float32,
+    use_cache=True,
+)
+client = pipeline(
+    task="text-generation",
+    model="Qwen/Qwen2.5-0.5B-Instruct",
+    model_kwargs=model_kwargs,
+    device=device,
+    token=token,
+    trust_remote_code=True,
+)
+async def respond(
+        audio: Optional[str] = None,
+        text: Optional[str] = None,
+        do_search: bool = False,
+        history: Optional[List[Dict]] = None
+    ) -> Tuple[Optional[str], str]:
+    """Generate a spoken and written reply to a text or voice message.
+
+    Typed text takes precedence over audio when both are given; audio is
+    transcribed with ``transcribe``.
+
+    Args:
+        audio: Path to a recorded audio file, if any.
+        text: Typed user message, if any.
+        do_search: When true, web search snippets are appended to the
+            system prompt.
+        history: Earlier ``{"role", "content"}`` messages, oldest first.
+
+    Returns:
+        ``(audio_path, reply_text)``. ``audio_path`` is ``None`` when there
+        was no usable input or when any step failed; ``reply_text`` then
+        holds a short explanatory message. Exceptions are logged, not raised.
+    """
+    try:
+        if text:
+            user_text = text.strip()
+        elif audio:
+            user_text = await transcribe(audio)
+        else:
+            return None, "No input provided"
+        # Whitespace-only text or an empty transcription is not a real request
+        if not user_text:
+            return None, "No input provided"
+        # Build the system message first so search context lands only there
+        # (a str.replace over the finished prompt would also rewrite any
+        # copy of the system prompt inside history or the user message)
+        system_content = SYSTEM_PROMPT
+        if do_search:
+            results = searcher.search(user_text)
+            if results:
+                snippets = []
+                for result in results:
+                    snippet = result['content'][:500].strip()
+                    # Web text is untrusted: drop chat-template markers so a
+                    # page cannot open or close a turn in the prompt
+                    snippet = snippet.replace("<|im_start|>", "").replace("<|im_end|>", "")
+                    snippets.append(f"{snippet}\n")
+                search_context = "Based on search results:\n" + "".join(snippets)
+                system_content = f"{SYSTEM_PROMPT}\n{search_context}"
+        # Conversation context: system, prior turns, then the current message
+        messages = [{"role": "system", "content": system_content}]
+        if history:
+            messages.extend(history)
+        messages.append({"role": "user", "content": user_text})
+        # Format the turns for Qwen and leave the assistant turn open
+        prompt = "".join(
+            f"<|im_start|>{msg['role']}\n{msg['content']}<|im_end|>\n"
+            for msg in messages
+        )
+        prompt += "<|im_start|>assistant\n"
+        # Generate response
+        reply = client(
+            prompt,
+            max_new_tokens=400,
+            do_sample=True,
+            temperature=0.7,
+            top_p=0.9,
+            num_return_sequences=1
+        )
+        # The generated text is split on the assistant marker, so keep only
+        # what follows the last one, up to the end-of-turn marker
+        assistant_response = reply[0]['generated_text']
+        assistant_response = assistant_response.split("<|im_start|>assistant\n")[-1]
+        assistant_response = assistant_response.split("<|im_end|>")[0].strip()
+        # Convert response to speech
+        audio_path = await generate_speech(assistant_response)
+        return audio_path, assistant_response
+    except Exception as e:
+        # Top-level boundary for the UI: log with traceback, return a fallback
+        logger.exception("Error in respond: %s", e)
+        return None, "Sorry, I encountered an error"

services/search.py ADDED Viewed

	@@ -0,0 +1,85 @@

+import logging
+from typing import List, Dict
+import requests
+from bs4 import BeautifulSoup
+from urllib3.exceptions import InsecureRequestWarning
+# Disable SSL warnings for requests
+requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
+logger = logging.getLogger(__name__)
+class WebSearcher:
+    """Fetch Google result links for a query and extract each page's text."""
+    def __init__(self, verify_ssl: bool = True):
+        """Create a searcher.
+
+        Args:
+            verify_ssl: Verify TLS certificates on every request. Fetched
+                page text is passed on to callers, so verification is on by
+                default; pass ``False`` only to reach hosts with broken
+                certificates.
+        """
+        self.verify_ssl = verify_ssl
+        self.headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0"
+        }
+    def extract_text(self, html_content: str) -> str:
+        """Return the visible text of an HTML document, capped at 8000 chars."""
+        soup = BeautifulSoup(html_content, 'html.parser')
+        # Remove unwanted elements
+        for element in soup(['script', 'style', 'nav', 'header', 'footer', 'iframe']):
+            element.decompose()
+        text = ' '.join(soup.stripped_strings)
+        return text[:8000]  # Limit text length
+    def search(self, query: str, max_results: int = 3) -> List[Dict]:
+        """Search Google for ``query`` and fetch the text of the result pages.
+
+        Args:
+            query: Search terms.
+            max_results: Maximum number of pages to return.
+
+        Returns:
+            Up to ``max_results`` dicts with ``"url"`` and ``"content"``
+            keys. Failures are logged rather than raised, so the list may
+            be shorter or empty.
+        """
+        results = []
+        try:
+            with requests.Session() as session:
+                # Google search parameters
+                search_url = "https://www.google.com/search"
+                params = {
+                    "q": query,
+                    "num": max_results,
+                    "hl": "en"
+                }
+                response = session.get(
+                    search_url,
+                    headers=self.headers,
+                    params=params,
+                    timeout=10,
+                    verify=self.verify_ssl
+                )
+                response.raise_for_status()
+                # Parse search results
+                soup = BeautifulSoup(response.text, 'html.parser')
+                for result in soup.select('div.g'):
+                    # Stop once enough pages were fetched; skipped entries
+                    # below do not use up a result slot
+                    if len(results) >= max_results:
+                        break
+                    link = result.find('a')
+                    if not link:
+                        continue
+                    url = link.get('href', '')
+                    if not url.startswith('http'):
+                        continue
+                    try:
+                        # Fetch webpage content
+                        page_response = session.get(
+                            url,
+                            headers=self.headers,
+                            timeout=5,
+                            verify=self.verify_ssl
+                        )
+                        page_response.raise_for_status()
+                        content = self.extract_text(page_response.text)
+                        results.append({
+                            "url": url,
+                            "content": content
+                        })
+                        logger.info(f"Successfully fetched content from {url}")
+                    except Exception as e:
+                        # Best effort: one bad page must not abort the search
+                        logger.warning(f"Failed to fetch {url}: {str(e)}")
+                        continue
+        except Exception as e:
+            logger.error(f"Search failed: {str(e)}")
+        return results

services/whisper.py ADDED Viewed

	@@ -0,0 +1,92 @@

+import os
+import tempfile
+import logging
+from typing import Optional
+import torch
+import librosa
+import edge_tts
+from transformers import WhisperProcessor, WhisperForConditionalGeneration
+from config.config import VOICE, FALLBACK_VOICES
+logger = logging.getLogger(__name__)
+# Whisper speech-to-text: the processor and the model are loaded from the
+# same checkpoint, and the model is placed on the CPU
+_WHISPER_CHECKPOINT = "openai/whisper-tiny"
+processor = WhisperProcessor.from_pretrained(
+    _WHISPER_CHECKPOINT,
+    local_files_only=False,
+)
+model = WhisperForConditionalGeneration.from_pretrained(
+    _WHISPER_CHECKPOINT,
+    torch_dtype=torch.float32,
+    low_cpu_mem_usage=True,
+    local_files_only=False,
+).to("cpu")
+# Voice selection handling
+async def get_valid_voice() -> str:
+    """Return the first configured voice that Edge TTS reports as available.
+
+    ``VOICE`` is tried first, then each entry of ``FALLBACK_VOICES`` in order.
+
+    Raises:
+        RuntimeError: If none of the configured voices is available.
+    """
+    offered = {entry["ShortName"] for entry in await edge_tts.list_voices()}
+    for candidate in (VOICE, *FALLBACK_VOICES):
+        if candidate in offered:
+            return candidate
+    raise RuntimeError("No valid voice found")
+# Text-to-speech conversion using Edge TTS
+async def generate_speech(text: str) -> Optional[str]:
+    """Synthesize ``text`` to an MP3 file and return the file's path.
+
+    The file is created with ``delete=False``, so on success the caller owns
+    it. On any failure the file is removed before the error propagates.
+
+    Args:
+        text: Non-empty string to speak.
+
+    Returns:
+        Path of the generated ``.mp3`` file.
+
+    Raises:
+        ValueError: If ``text`` is empty or not a string.
+        RuntimeError: If no configured voice is available, or the output
+            file is missing or empty after synthesis.
+    """
+    if not text or not isinstance(text, str):
+        raise ValueError("Invalid text input")
+    voice = await get_valid_voice()
+    logger.info("Using voice: %s", voice)
+    # Reserve a path only; the handle is closed before Edge TTS writes to it
+    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
+        tmp_path = tmp_file.name
+    try:
+        communicate = edge_tts.Communicate(text, voice)
+        await communicate.save(tmp_path)
+        if not os.path.exists(tmp_path) or os.path.getsize(tmp_path) == 0:
+            raise RuntimeError("Speech file empty or not created")
+    except BaseException:
+        # Also covers task cancellation: never leave a stray temp file behind
+        try:
+            os.remove(tmp_path)
+        except OSError:
+            pass
+        raise
+    logger.info("Speech generated successfully: %s", tmp_path)
+    return tmp_path
+# Speech-to-text using Whisper
+async def transcribe(audio_file: str) -> str:
+    """Transcribe an audio file to English text with the Whisper model.
+
+    The audio is loaded as 16 kHz mono and at most its first 30 seconds
+    are read.
+
+    Args:
+        audio_file: Path of the audio file to transcribe.
+
+    Returns:
+        The decoded transcription with surrounding whitespace removed.
+    """
+    waveform, sample_rate = librosa.load(audio_file, sr=16000, mono=True, duration=30)
+    features = processor(
+        waveform,
+        sampling_rate=sample_rate,
+        return_tensors="pt",
+        return_attention_mask=True,
+    ).to(model.device)
+    generation_options = dict(
+        language="en",
+        task="transcribe",
+        max_length=448,
+        temperature=0.0,
+    )
+    # Inference only, so gradient tracking is switched off
+    with torch.no_grad():
+        token_ids = model.generate(
+            input_features=features.input_features,
+            attention_mask=features.attention_mask,
+            **generation_options,
+        )
+        decoded = processor.batch_decode(token_ids, skip_special_tokens=True)
+        transcription = decoded[0].strip()
+    logger.info(f"Transcribed text: {transcription}")
+    return transcription