Spaces:

Devakumar868
/

my-voice-assistant

Paused

App Files Files Community

Devakumar868 commited on Jun 22

Commit

cf427d1

verified ·

1 Parent(s): fc9eb64

Update app.py

Browse files

Files changed (1) hide show

app.py +183 -53

app.py CHANGED Viewed

@@ -1,64 +1,194 @@
-import os, torch, numpy as np, soundfile as sf
-import gradio as gr
-from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline, BitsAndBytesConfig
 import nemo.collections.asr as nemo_asr
-from TTS.api import TTS
-from sklearn.linear_model import LogisticRegression  # for emotion prediction
 from datasets import load_dataset
 # Configuration
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-SAMPLE_RATE = 22050
-SEED = 42
 torch.manual_seed(SEED); np.random.seed(SEED)
-# 1. ASR: Parakeet RNNT
-asr = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained(
-    model_name="nvidia/parakeet-rnnt-1.1b"
-).to(DEVICE); asr.eval()
-# 2. SER: wav2vec2 emotion classifier
-ds = load_dataset("patrickvonplaten/emotion_speech", split="train[:10%]")  # sample load
-features = ds["audio"]
-labels = ds["label"]
-# placeholder audio feature extraction
-X = np.random.rand(len(features), 128); y = np.array(labels)
-clf = LogisticRegression().fit(X, y)
-# 3. NLP: LLaMA-3
-bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16)
-tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3-7b")
-llm = AutoModelForSeq2SeqLM.from_pretrained(
-    "meta-llama/Llama-3-7b", quantization_config=bnb_config, device_map="auto"
-).to(DEVICE)
-# 4. Emotion Prediction: SER → mapping
-def predict_emotion(audio_path):
-    return clf.predict(np.random.rand(1,128))[0]
-# 5. TTS: Dia 1.6B with emotion conditioning
-tts = TTS("nari-labs/Dia-1.6B", progress_bar=False, gpu=torch.cuda.is_available())
-def transcribe(audio):
-    sf.write("in.wav", audio, SAMPLE_RATE)
-    return asr.transcribe(["in.wav"])[0].text
-def generate_response(text, emo_tag):
-    prompt = f"[emotion:{emo_tag}] {text}"
-    inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)
-    gen = llm.generate(**inputs, max_new_tokens=100, do_sample=True, temperature=0.7)
-    return tokenizer.decode(gen[0], skip_special_tokens=True)
-def synthesize(text, emo_tag):
-    return tts.tts(text=text, speaker_wav=None, style_wav=None)
-def pipeline_fn(audio):
-    user_text = transcribe(audio); emo = predict_emotion("in.wav")
-    bot_text = generate_response(user_text, emo); wav = synthesize(bot_text, emo)
-    return bot_text, (SAMPLE_RATE, wav)
-iface = gr.Interface(
-    pipeline_fn, gr.Audio(source="microphone", type="numpy"),
-    [gr.Textbox(), gr.Audio()], title="Emotion-Aware Conversational AI"
-)
-iface.launch(server_name="0.0.0.0", server_port=7860)

+import os, torch, numpy as np, soundfile as sf, gradio as gr
+from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
 import nemo.collections.asr as nemo_asr
+from TTS.api import TTS  # Note: using TTS, not coqui_tts
+from sklearn.linear_model import LogisticRegression
 from datasets import load_dataset
+import tempfile
 # Configuration
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+SEED = 42; SAMPLE_RATE = 22050; TEMPERATURE = 0.7
 torch.manual_seed(SEED); np.random.seed(SEED)
+print(f"Using device: {DEVICE}")
+print(f"NumPy version: {np.__version__}")
+print(f"PyTorch version: {torch.__version__}")
+class ConversationalAI:
+    def __init__(self):
+        print("🔄 Initializing Conversational AI...")
+        self.setup_models()
+        print("✅ All models loaded successfully!")
+    def setup_models(self):
+        # 1. ASR: Parakeet RNNT
+        print("📢 Loading ASR model...")
+        try:
+            self.asr_model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained(
+                "nvidia/parakeet-rnnt-1.1b"
+            ).to(DEVICE).eval()
+            print("✅ ASR model loaded")
+        except Exception as e:
+            print(f"⚠️ ASR error: {e}")
+            # Fallback to Whisper
+            self.asr_pipeline = pipeline(
+                "automatic-speech-recognition",
+                model="openai/whisper-base.en",
+                device=0 if DEVICE == "cuda" else -1
+            )
+        # 2. SER: Simple emotion classifier (demo)
+        print("🎭 Setting up emotion recognition...")
+        # Create dummy SER for demo
+        X_demo = np.random.rand(100, 128)
+        y_demo = np.random.randint(0, 5, 100)  # 5 emotion classes
+        self.ser_clf = LogisticRegression().fit(X_demo, y_demo)
+        # 3. LLM: Quantized model for conversation
+        print("🧠 Loading LLM model...")
+        bnb_cfg = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_compute_dtype=torch.float16
+        )
+        model_name = "microsoft/DialoGPT-medium"
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+        self.tokenizer.pad_token = self.tokenizer.eos_token
+        self.llm_model = AutoModelForCausalLM.from_pretrained(
+            model_name,
+            quantization_config=bnb_cfg,
+            device_map="auto",
+            torch_dtype=torch.float16
+        )
+        print("✅ LLM model loaded")
+        # 4. TTS: Coqui TTS for speech synthesis
+        print("🗣️ Loading TTS model...")
+        try:
+            self.tts = TTS("tts_models/en/ljspeech/tacotron2-DDC").to(DEVICE)
+            print("✅ TTS model loaded")
+        except Exception as e:
+            print(f"⚠️ TTS error: {e}")
+            self.tts = None
+        # Memory cleanup
+        if DEVICE == "cuda":
+            torch.cuda.empty_cache()
+    def transcribe(self, audio):
+        try:
+            if hasattr(self, 'asr_model'):
+                temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
+                sf.write(temp_file.name, audio[1], audio[0])
+                transcription = self.asr_model.transcribe([temp_file.name])[0]
+                os.unlink(temp_file.name)
+                return transcription.text if hasattr(transcription, 'text') else str(transcription)
+            else:
+                return self.asr_pipeline({"sampling_rate": audio[0], "raw": audio[1]})["text"]
+        except Exception as e:
+            print(f"ASR Error: {e}")
+            return "Sorry, I couldn't understand the audio."
+    def predict_emotion(self):
+        # Simple emotion prediction (demo)
+        return self.ser_clf.predict(np.random.rand(1, 128))[0]
+    def generate_response(self, text, emo):
+        try:
+            prompt = f"Human: {text}\nAssistant:"
+            inputs = self.tokenizer.encode(prompt, return_tensors="pt", max_length=512, truncation=True).to(DEVICE)
+            with torch.no_grad():
+                outputs = self.llm_model.generate(
+                    inputs,
+                    max_length=inputs.shape[1] + 100,
+                    temperature=TEMPERATURE,
+                    do_sample=True,
+                    pad_token_id=self.tokenizer.eos_token_id
+                )
+            response = self.tokenizer.decode(outputs[0][inputs.shape[1]:], skip_special_tokens=True)
+            return response.split("Human:")[0].strip() or "I understand. Please tell me more."
+        except Exception as e:
+            print(f"LLM Error: {e}")
+            return "I'm having trouble processing that. Could you please rephrase?"
+    def synthesize(self, text):
+        try:
+            if self.tts:
+                wav = self.tts.tts(text=text)
+                if isinstance(wav, list):
+                    wav = np.array(wav, dtype=np.float32)
+                wav = wav / np.max(np.abs(wav)) if np.max(np.abs(wav)) > 0 else wav
+                return (SAMPLE_RATE, (wav * 32767).astype(np.int16))
+            else:
+                return (SAMPLE_RATE, np.zeros(SAMPLE_RATE, dtype=np.int16))
+        except Exception as e:
+            print(f"TTS Error: {e}")
+            return (SAMPLE_RATE, np.zeros(SAMPLE_RATE, dtype=np.int16))
+    def process_conversation(self, audio_input, chat_history):
+        if audio_input is None:
+            return chat_history, None, ""
+        try:
+            # Pipeline: ASR -> SER -> LLM -> TTS
+            user_text = self.transcribe(audio_input)
+            if not user_text.strip():
+                return chat_history, None, "No speech detected."
+            emo = self.predict_emotion()
+            ai_response = self.generate_response(user_text, emo)
+            audio_response = self.synthesize(ai_response)
+            chat_history.append([user_text, ai_response])
+            if DEVICE == "cuda":
+                torch.cuda.empty_cache()
+            return chat_history, audio_response, f"You said: {user_text}"
+        except Exception as e:
+            error_msg = f"Error: {e}"
+            print(error_msg)
+            return chat_history, None, error_msg
+# Initialize AI system
+print("🚀 Starting initialization...")
+ai_system = ConversationalAI()
+# Gradio interface
+def create_interface():
+    with gr.Blocks(title="Emotion-Aware Conversational AI") as demo:
+        gr.HTML("<h1>🤖 Emotion-Aware Conversational AI</h1>")
+        with gr.Row():
+            with gr.Column():
+                chatbot = gr.Chatbot(label="Conversation", height=400)
+                audio_input = gr.Audio(label="🎤 Speak", sources=["microphone"], type="numpy")
+                with gr.Row():
+                    submit_btn = gr.Button("💬 Process", variant="primary")
+                    clear_btn = gr.Button("🗑️ Clear", variant="secondary")
+            with gr.Column():
+                audio_output = gr.Audio(label="🔊 AI Response", autoplay=True)
+                status = gr.Textbox(label="📊 Status", lines=3, interactive=False)
+        def process_audio(audio, history):
+            return ai_system.process_conversation(audio, history)
+        def clear_chat():
+            return [], None, "Conversation cleared."
+        submit_btn.click(process_audio, [audio_input, chatbot], [chatbot, audio_output, status])
+        clear_btn.click(clear_chat, outputs=[chatbot, audio_output, status])
+        audio_input.change(process_audio, [audio_input, chatbot], [chatbot, audio_output, status])
+    return demo
+# Launch
+if __name__ == "__main__":
+    demo = create_interface()
+    demo.launch(server_name="0.0.0.0", server_port=7860, share=True)