Spaces:

Devakumar868
/

my-voice-assistant

Paused

App Files Files Community

Devakumar868 committed on Jun 22

Commit

1c51010

verified ·

1 Parent(s): c5ff1a8

Update app.py

Browse files

Files changed (1) hide show

app.py +52 -355

app.py CHANGED Viewed

@@ -1,367 +1,64 @@
-import os
-import gc
-import time
-import torch
-import numpy as np
-import soundfile as sf
 import gradio as gr
-from transformers import (
-    AutoTokenizer,
-    AutoModelForCausalLM,
-    BitsAndBytesConfig,
-    pipeline
-)
-from TTS.api import TTS
 import nemo.collections.asr as nemo_asr
-from scipy.io.wavfile import write
-import tempfile
-import threading
-import queue
 # Configuration
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 SAMPLE_RATE = 22050
-MAX_LENGTH = 512
-TEMPERATURE = 0.7
 SEED = 42
-# Set seeds for reproducibility
-torch.manual_seed(SEED)
-np.random.seed(SEED)
-class ConversationalAI:
-    def __init__(self):
-        print("🔄 Initializing Conversational AI...")
-        self.setup_models()
-        print("✅ All models loaded successfully!")
-    def setup_models(self):
-        """Initialize all models with T4 GPU optimization"""
-        # 1. ASR Model - Parakeet for high accuracy speech recognition
-        print("📢 Loading ASR model...")
-        try:
-            self.asr_model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained(
-                model_name="nvidia/parakeet-tdt-0.6b-v2"
-            ).to(DEVICE)[7][9]
-            self.asr_model.eval()
-            print("✅ ASR model loaded")
-        except Exception as e:
-            print(f"⚠️ ASR fallback: {e}")
-            # Fallback to Whisper if Parakeet fails
-            self.asr_pipeline = pipeline(
-                "automatic-speech-recognition",
-                model="openai/whisper-base.en",
-                device=0 if DEVICE == "cuda" else -1
-            )[31]
-        # 2. LLM Model - Quantized Llama for T4 GPU compatibility
-        print("🧠 Loading LLM model...")
-        quantization_config = BitsAndBytesConfig(
-            load_in_4bit=True,
-            bnb_4bit_compute_dtype=torch.float16,
-            bnb_4bit_use_double_quant=True,
-            bnb_4bit_quant_type="nf4"
-        )[25][32]
-        model_name = "microsoft/DialoGPT-medium"  # Optimized for conversation
-        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
-        self.tokenizer.pad_token = self.tokenizer.eos_token
-        self.llm_model = AutoModelForCausalLM.from_pretrained(
-            model_name,
-            quantization_config=quantization_config,
-            device_map="auto",
-            torch_dtype=torch.float16,
-            low_cpu_mem_usage=True
-        )[42][44]
-        print("✅ LLM model loaded")
-        # 3. TTS Model - Coqui TTS for female voice consistency
-        print("🗣️ Loading TTS model...")
-        try:
-            # Using XTTS-v2 for high quality female voice
-            self.tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(DEVICE)[33][35]
-            # Create consistent female voice embedding
-            self.female_voice_path = self.create_female_reference()
-            print("✅ TTS model loaded with female voice")
-        except Exception as e:
-            print(f"⚠️ TTS fallback: {e}")
-            # Fallback to simpler TTS model
-            self.tts = TTS("tts_models/en/ljspeech/tacotron2-DDC").to(DEVICE)[33]
-        # Memory optimization
-        if DEVICE == "cuda":
-            torch.cuda.empty_cache()
-    def create_female_reference(self):
-        """Create a consistent female voice reference for TTS"""
-        # Generate a short reference audio with consistent female characteristics
-        reference_text = "Hello, I am your AI assistant with a consistent female voice."
-        # Create temporary reference file
-        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
-        try:
-            # Use a built-in female speaker if available
-            wav = self.tts.tts(
-                text=reference_text,
-                language="en",
-                split_sentences=True
-            )
-            # Save reference audio
-            sf.write(temp_file.name, wav, SAMPLE_RATE)
-            return temp_file.name
-        except:
-            return None
-    def transcribe_audio(self, audio_data):
-        """Convert speech to text using ASR"""
-        try:
-            if hasattr(self, 'asr_model'):
-                # Save audio temporarily for NeMo ASR
-                temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
-                sf.write(temp_file.name, audio_data[1], audio_data[0])
-                # Transcribe
-                transcription = self.asr_model.transcribe([temp_file.name])[0]
-                os.unlink(temp_file.name)
-                return transcription.text if hasattr(transcription, 'text') else transcription
-            else:
-                # Use Whisper pipeline
-                return self.asr_pipeline({"sampling_rate": audio_data[0], "raw": audio_data[1]})["text"]
-        except Exception as e:
-            print(f"ASR Error: {e}")
-            return "Sorry, I couldn't understand the audio."
-    def generate_response(self, user_input, chat_history):
-        """Generate conversational response using LLM"""
-        try:
-            # Prepare conversation context
-            context = ""
-            for turn in chat_history[-3:]:  # Last 3 turns for context
-                context += f"Human: {turn[0]}\nAssistant: {turn[1]}\n"
-            context += f"Human: {user_input}\nAssistant:"
-            # Tokenize and generate
-            inputs = self.tokenizer.encode(context, return_tensors="pt", max_length=512, truncation=True).to(DEVICE)
-            with torch.no_grad():
-                outputs = self.llm_model.generate(
-                    inputs,
-                    max_length=inputs.shape[1] + 100,
-                    temperature=TEMPERATURE,
-                    do_sample=True,
-                    pad_token_id=self.tokenizer.eos_token_id,
-                    no_repeat_ngram_size=2,
-                    top_p=0.9
-                )
-            response = self.tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)
-            response = response.split("Human:")[0].strip()
-            return response if response else "I understand. Please tell me more."
-        except Exception as e:
-            print(f"LLM Error: {e}")
-            return "I'm having trouble processing that. Could you please rephrase?"
-    def synthesize_speech(self, text):
-        """Convert text to speech with consistent female voice"""
-        try:
-            if self.female_voice_path and hasattr(self.tts, 'tts'):
-                # Use voice cloning for consistency
-                wav = self.tts.tts(
-                    text=text,
-                    speaker_wav=self.female_voice_path,
-                    language="en",
-                    split_sentences=True
-                )
-            else:
-                # Fallback to default synthesis
-                wav = self.tts.tts(text=text)
-            # Ensure proper format
-            if isinstance(wav, list):
-                wav = np.array(wav, dtype=np.float32)
-            # Normalize audio
-            wav = wav / np.max(np.abs(wav)) if np.max(np.abs(wav)) > 0 else wav
-            return (SAMPLE_RATE, (wav * 32767).astype(np.int16))
-        except Exception as e:
-            print(f"TTS Error: {e}")
-            # Return silence as fallback
-            return (SAMPLE_RATE, np.zeros(SAMPLE_RATE, dtype=np.int16))
-    def process_conversation(self, audio_input, chat_history):
-        """Main pipeline: Speech -> Text -> LLM -> Speech"""
-        if audio_input is None:
-            return chat_history, None, ""
-        try:
-            # Step 1: Speech to Text
-            user_text = self.transcribe_audio(audio_input)
-            if not user_text.strip():
-                return chat_history, None, "No speech detected."
-            # Step 2: Generate Response
-            ai_response = self.generate_response(user_text, chat_history)
-            # Step 3: Text to Speech
-            audio_response = self.synthesize_speech(ai_response)
-            # Update chat history
-            chat_history.append([user_text, ai_response])
-            # Memory cleanup
-            if DEVICE == "cuda":
-                torch.cuda.empty_cache()
-                gc.collect()
-            return chat_history, audio_response, f"You said: {user_text}"
-        except Exception as e:
-            error_msg = f"Error processing conversation: {e}"
-            print(error_msg)
-            return chat_history, None, error_msg
-# Initialize the AI system
-print("🚀 Starting Conversational AI initialization...")
-ai_system = ConversationalAI()
-# Gradio Interface
-def create_interface():
-    """Create the Gradio interface for the conversational AI"""
-    with gr.Blocks(
-        title="Advanced Conversational AI",
-        theme=gr.themes.Soft(),
-        css="""
-        .main-header { text-align: center; color: #2563eb; margin-bottom: 2rem; }
-        .chat-container { max-height: 500px; overflow-y: auto; }
-        .status-box { background: #f0f9ff; padding: 1rem; border-radius: 0.5rem; }
-        """
-    ) as demo:
-        gr.HTML("""
-            <div class="main-header">
-                <h1>🤖 Advanced Conversational AI</h1>
-                <p>Speak naturally and get intelligent responses with consistent female voice</p>
-            </div>
-        """)
-        with gr.Row():
-            with gr.Column(scale=2):
-                # Chat History
-                chatbot = gr.Chatbot(
-                    label="Conversation History",
-                    elem_classes=["chat-container"],
-                    height=400,
-                    show_copy_button=True
-                )
-                # Audio Input
-                audio_input = gr.Audio(
-                    label="🎤 Speak to AI",
-                    sources=["microphone"],
-                    type="numpy",
-                    format="wav"
-                )
-                # Control Buttons
-                with gr.Row():
-                    submit_btn = gr.Button("💬 Process Speech", variant="primary", scale=2)
-                    clear_btn = gr.Button("🗑️ Clear Chat", variant="secondary", scale=1)
-            with gr.Column(scale=1):
-                # AI Response Audio
-                audio_output = gr.Audio(
-                    label="🔊 AI Response",
-                    type="numpy",
-                    autoplay=True
-                )
-                # Status Display
-                status_display = gr.Textbox(
-                    label="📊 Status",
-                    lines=3,
-                    elem_classes=["status-box"],
-                    interactive=False
-                )
-                # System Information
-                gr.HTML(f"""
-                    <div class="status-box">
-                        <h3>🔧 System Info</h3>
-                        <p><strong>Device:</strong> {DEVICE.upper()}</p>
-                        <p><strong>Models:</strong> Parakeet ASR + DialoGPT + XTTS</p>
-                        <p><strong>Voice:</strong> Consistent Female</p>
-                        <p><strong>Memory:</strong> 4-bit Quantized</p>
-                    </div>
-                """)
-        # Event Handlers
-        def process_audio(audio, history):
-            return ai_system.process_conversation(audio, history)
-        def clear_conversation():
-            if DEVICE == "cuda":
-                torch.cuda.empty_cache()
-            return [], None, "Conversation cleared."
-        # Button Events
-        submit_btn.click(
-            fn=process_audio,
-            inputs=[audio_input, chatbot],
-            outputs=[chatbot, audio_output, status_display],
-            show_progress=True
-        )
-        clear_btn.click(
-            fn=clear_conversation,
-            outputs=[chatbot, audio_output, status_display]
-        )
-        # Auto-process when audio is recorded
-        audio_input.change(
-            fn=process_audio,
-            inputs=[audio_input, chatbot],
-            outputs=[chatbot, audio_output, status_display]
-        )
-        # Example Usage
-        gr.HTML("""
-            <div style="margin-top: 2rem; padding: 1rem; background: #fef3c7; border-radius: 0.5rem;">
-                <h3>💡 How to Use:</h3>
-                <ol>
-                    <li>Click the microphone button and speak clearly</li>
-                    <li>Wait for the AI to process your speech</li>
-                    <li>Listen to the AI's response with consistent female voice</li>
-                    <li>Continue the conversation naturally</li>
-                </ol>
-            </div>
-        """)
-    return demo
-# Launch the application
-if __name__ == "__main__":
-    print("🌟 Creating Gradio interface...")
-    demo = create_interface()
-    print("🚀 Launching Conversational AI...")
-    demo.launch(
-        server_name="0.0.0.0",
-        server_port=7860,
-        share=True,
-        show_error=True,
-        debug=False
-    )

+import os, torch, numpy as np, soundfile as sf
 import gradio as gr
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline, BitsAndBytesConfig
 import nemo.collections.asr as nemo_asr
+from TTS.api import TTS
+from sklearn.linear_model import LogisticRegression  # for emotion prediction
+from datasets import load_dataset
 # Configuration
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 SAMPLE_RATE = 22050
 SEED = 42
+# Seed the torch and NumPy random generators with the shared SEED.
+torch.manual_seed(SEED)
+np.random.seed(SEED)
+# 1. ASR: Parakeet RNNT checkpoint, moved to DEVICE and switched to eval mode
+asr = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained(
+    model_name="nvidia/parakeet-rnnt-1.1b"
+)
+asr = asr.to(DEVICE)
+asr.eval()
+# 2. SER: emotion classifier (built here as a scikit-learn LogisticRegression)
+# Only the first 10% of the train split is loaded.
+ds = load_dataset("patrickvonplaten/emotion_speech", split="train[:10%]")
+features = ds["audio"]
+labels = ds["label"]
+# Placeholder feature extraction: one random 128-dim row per clip. The audio
+# content is never read, so the fitted classifier carries no information
+# about the speech itself.
+X = np.random.rand(len(features), 128)
+y = np.array(labels)
+clf = LogisticRegression()
+clf.fit(X, y)
+# 3. NLP: LLaMA-3, loaded with 4-bit bitsandbytes quantization
+# LLaMA is a decoder-only architecture, so it must be loaded through the
+# causal-LM auto class; AutoModelForSeq2SeqLM only maps encoder-decoder
+# configs and refuses a LLaMA config.
+from transformers import AutoModelForCausalLM
+bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
+# NOTE(review): confirm this repo id exists on the Hub and that the Space has
+# access to it before relying on this load.
+tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3-7b")
+# device_map="auto" already places the quantized weights, so no explicit
+# .to(DEVICE) follows: it would be redundant, and moving a bitsandbytes
+# quantized model is rejected by some transformers/bitsandbytes versions.
+llm = AutoModelForCausalLM.from_pretrained(
+    "meta-llama/Llama-3-7b", quantization_config=bnb_config, device_map="auto"
+)
+# 4. Emotion Prediction: SER → mapping
+def predict_emotion(audio_path):
+    """Return the label `clf` predicts for one random 128-dim feature row.
+
+    `audio_path` is accepted but never read: the feature row is drawn from
+    `np.random.rand`, so the result does not depend on the recording.
+    """
+    placeholder_row = np.random.rand(1, 128)
+    predicted = clf.predict(placeholder_row)
+    return predicted[0]
+# 5. TTS: Dia 1.6B with emotion conditioning
+# The model is placed on the GPU only when torch reports CUDA as available.
+# NOTE(review): "nari-labs/Dia-1.6B" does not follow the
+# "tts_models/<lang>/<dataset>/<model>" naming that the removed version of
+# this file passed to TTS(...); confirm this id is loadable through TTS.api.
+tts = TTS("nari-labs/Dia-1.6B", progress_bar=False, gpu=torch.cuda.is_available())
+def transcribe(audio):
+    """Write the recording to "in.wav" and return its ASR transcript.
+
+    Args:
+        audio: ``(sample_rate, samples)`` tuple as delivered by
+            ``gr.Audio(type="numpy")``. A bare sample array is also accepted
+            and is then assumed to be sampled at SAMPLE_RATE.
+
+    Returns:
+        The transcript of the recording.
+    """
+    if isinstance(audio, tuple):
+        rate, samples = audio
+    else:
+        rate, samples = SAMPLE_RATE, audio
+    # Save with the recording's own rate: stamping a different rate into the
+    # WAV header would change the speed/pitch the ASR model hears.
+    sf.write("in.wav", samples, rate)
+    result = asr.transcribe(["in.wav"])[0]
+    # Presumably transcribe() yields plain strings or objects exposing .text
+    # depending on the NeMo version; accept either.
+    return getattr(result, "text", result)
+def generate_response(text, emo_tag):
+    """Generate a reply to `text`, prefixing the prompt with an emotion tag.
+
+    Args:
+        text: The user's transcribed utterance.
+        emo_tag: Emotion label, embedded in the prompt as ``[emotion:<tag>]``.
+
+    Returns:
+        The decoded reply with special tokens removed and surrounding
+        whitespace stripped.
+    """
+    prompt = f"[emotion:{emo_tag}] {text}"
+    inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
+    gen = llm.generate(**inputs, max_new_tokens=100, do_sample=True, temperature=0.7)
+    reply_ids = gen[0]
+    if not getattr(llm.config, "is_encoder_decoder", False):
+        # Decoder-only models return prompt + continuation; drop the prompt
+        # tokens so the reply does not echo the emotion tag and user text.
+        reply_ids = reply_ids[inputs["input_ids"].shape[1]:]
+    return tokenizer.decode(reply_ids, skip_special_tokens=True).strip()
+def synthesize(text, emo_tag):
+    """Synthesize `text` with the module-level `tts` model.
+
+    `emo_tag` is accepted but not used; no speaker or style reference audio
+    is supplied to the model.
+    """
+    tts_kwargs = {"text": text, "speaker_wav": None, "style_wav": None}
+    waveform = tts.tts(**tts_kwargs)
+    return waveform
+def pipeline_fn(audio):
+    """Run one turn: speech -> text -> emotion -> reply text -> reply speech.
+
+    Args:
+        audio: Value of the microphone ``gr.Audio`` input, or None when
+            nothing was recorded.
+
+    Returns:
+        ``(reply_text, (SAMPLE_RATE, waveform))``, or ``("", None)`` when
+        there is no recording to process.
+    """
+    if audio is None:
+        # Nothing recorded: leave both outputs empty instead of crashing.
+        return "", None
+    user_text = transcribe(audio)
+    # transcribe() has just written the recording to "in.wav".
+    emo = predict_emotion("in.wav")
+    bot_text = generate_response(user_text, emo)
+    wav = synthesize(bot_text, emo)
+    # gr.Audio expects a NumPy array; the TTS output may be a plain list.
+    wav = np.asarray(wav, dtype=np.float32)
+    # NOTE(review): SAMPLE_RATE is assumed to match the TTS output rate; confirm.
+    return bot_text, (SAMPLE_RATE, wav)
+# Gradio UI: microphone in, reply text and reply audio out.
+# gr.Audio takes `sources=[...]` (a list) in the Gradio API the removed
+# version of this file targeted; the singular `source=` keyword is rejected there.
+iface = gr.Interface(
+    fn=pipeline_fn,
+    inputs=gr.Audio(sources=["microphone"], type="numpy"),
+    outputs=[gr.Textbox(), gr.Audio()],
+    title="Emotion-Aware Conversational AI",
+)
+# Bind to all interfaces on port 7860.
+iface.launch(server_name="0.0.0.0", server_port=7860)