Create app.py
app.py
ADDED
@@ -0,0 +1,364 @@
import gradio as gr
import os
import tempfile
import subprocess
import librosa
import soundfile as sf
import torch
from pathlib import Path
import traceback
from typing import List, Dict, Tuple, Optional

# Install required packages
def install_requirements():
    """Install required packages if not already installed"""
    try:
        import nemo
        print("NeMo already installed")
    except ImportError:
        print("Installing NeMo...")
        subprocess.run([
            "pip", "install",
            "nemo_toolkit[asr,tts] @ git+https://github.com/NVIDIA/NeMo.git"
        ], check=True)

    try:
        import moviepy
        print("MoviePy already installed")
    except ImportError:
        print("Installing MoviePy...")
        subprocess.run(["pip", "install", "moviepy"], check=True)

# Try to install requirements
try:
    install_requirements()
    from nemo.collections.speechlm2.models import SALM
    import moviepy.editor as mp
    DEPENDENCIES_AVAILABLE = True
except Exception as e:
    print(f"Warning: Could not install dependencies: {e}")
    DEPENDENCIES_AVAILABLE = False

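# Note: installing packages at import time like this assumes a writable Python
# environment (e.g. a local venv or a Space that permits runtime installs); the more
# common route is to pin nemo_toolkit[asr,tts] and moviepy in requirements.txt.
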
class VideoQASummarizer:
    def __init__(self):
        self.model = None
        self.current_transcript = ""
        self.model_loaded = False

    def load_model(self):
        """Load the Canary-Qwen-2.5B model"""
        if not DEPENDENCIES_AVAILABLE:
            return "Error: Required dependencies not available. Please install manually."

        try:
            if self.model is None:
                print("Loading Canary-Qwen-2.5B model...")
                self.model = SALM.from_pretrained('nvidia/canary-qwen-2.5b')
                self.model_loaded = True
                return "Model loaded successfully!"
            return "Model already loaded."
        except Exception as e:
            error_msg = f"Error loading model: {str(e)}"
            print(error_msg)
            print(traceback.format_exc())
            return error_msg

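    # Note: SALM.from_pretrained() pulls the nvidia/canary-qwen-2.5b checkpoint from
    # the Hugging Face Hub on first use, so the initial load can take a while and
    # needs enough RAM/VRAM for a 2.5B-parameter model.
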
    def extract_audio_from_video(self, video_path: str) -> str:
        """Extract audio from video file"""
        try:
            # Create temporary audio file
            temp_audio = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
            temp_audio_path = temp_audio.name
            temp_audio.close()

            # Load video and extract audio
            video = mp.VideoFileClip(video_path)
            audio = video.audio

            # Write audio to temporary file
            audio.write_audiofile(temp_audio_path, verbose=False, logger=None)

            # Clean up
            audio.close()
            video.close()

            return temp_audio_path
        except Exception as e:
            raise Exception(f"Error extracting audio: {str(e)}")

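    # Note: `import moviepy.editor` and the `verbose=` keyword used above follow the
    # MoviePy 1.x API; MoviePy 2.x removed the moviepy.editor module and the verbose
    # flag, so this code assumes a 1.x install.
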
    def preprocess_audio(self, audio_path: str) -> str:
        """Preprocess audio for the model (ensure correct format)"""
        try:
            # Load audio
            audio, sr = librosa.load(audio_path, sr=16000)  # Resample to 16kHz if needed

            # Create new temporary file for processed audio
            temp_processed = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
            temp_processed_path = temp_processed.name
            temp_processed.close()

            # Save processed audio
            sf.write(temp_processed_path, audio, 16000)

            return temp_processed_path
        except Exception as e:
            raise Exception(f"Error preprocessing audio: {str(e)}")

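    # Note: canary-qwen-2.5b expects 16 kHz mono-channel input, which is why the audio
    # is resampled to 16000 Hz here; librosa.load() also downmixes to mono by default.
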
    def transcribe_audio(self, audio_path: str) -> str:
        """Transcribe audio using Canary-Qwen-2.5B in ASR mode"""
        try:
            if not self.model_loaded:
                return "Error: Model not loaded. Please load the model first."

            # Preprocess audio
            processed_audio_path = self.preprocess_audio(audio_path)

            # Transcribe using ASR mode
            answer_ids = self.model.generate(
                prompts=[
                    [{"role": "user", "content": f"Transcribe the following: {self.model.audio_locator_tag}", "audio": [processed_audio_path]}]
                ],
                max_new_tokens=512,
            )

            transcript = self.model.tokenizer.ids_to_text(answer_ids[0].cpu())

            # Clean up temporary file
            os.unlink(processed_audio_path)

            return transcript.strip()
        except Exception as e:
            error_msg = f"Error during transcription: {str(e)}"
            print(error_msg)
            print(traceback.format_exc())
            return error_msg

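    # Note: the prompt above (a user turn containing model.audio_locator_tag plus an
    # "audio" list of file paths) mirrors the ASR usage shown on the canary-qwen-2.5b
    # model card. max_new_tokens=512 caps the transcript length, so very long recordings
    # may come back truncated; splitting the audio into chunks and transcribing each one
    # is a possible workaround.
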
    def answer_question(self, question: str, transcript: str) -> str:
        """Answer questions about the transcript using LLM mode"""
        try:
            if not self.model_loaded:
                return "Error: Model not loaded. Please load the model first."

            if not transcript:
                return "Error: No transcript available. Please transcribe a video first."

            # Use LLM mode to answer questions
            prompt = f"Based on the following transcript, please answer this question: {question}\n\nTranscript: {transcript}"

            with self.model.llm.disable_adapter():
                answer_ids = self.model.generate(
                    prompts=[[{"role": "user", "content": prompt}]],
                    max_new_tokens=512,
                )

            answer = self.model.tokenizer.ids_to_text(answer_ids[0].cpu())
            return answer.strip()
        except Exception as e:
            error_msg = f"Error answering question: {str(e)}"
            print(error_msg)
            print(traceback.format_exc())
            return error_msg

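    # Note: model.llm.disable_adapter() runs the underlying Qwen LLM in text-only mode,
    # as shown on the model card. Embedding the whole transcript in the prompt assumes
    # it fits in the LLM's context window; very long transcripts may need to be
    # shortened or chunked first.
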
    def summarize_transcript(self, transcript: str, summary_type: str = "general") -> str:
        """Summarize the transcript using LLM mode"""
        try:
            if not self.model_loaded:
                return "Error: Model not loaded. Please load the model first."

            if not transcript:
                return "Error: No transcript available. Please transcribe a video first."

            # Create different summary prompts based on type
            if summary_type == "bullet_points":
                prompt = f"Please create a bullet-point summary of the key points from this transcript:\n\n{transcript}"
            elif summary_type == "detailed":
                prompt = f"Please provide a detailed summary of this transcript, including main topics and important details:\n\n{transcript}"
            else:  # general
                prompt = f"Please provide a concise summary of this transcript:\n\n{transcript}"

            with self.model.llm.disable_adapter():
                answer_ids = self.model.generate(
                    prompts=[[{"role": "user", "content": prompt}]],
                    max_new_tokens=1024,
                )

            summary = self.model.tokenizer.ids_to_text(answer_ids[0].cpu())
            return summary.strip()
        except Exception as e:
            error_msg = f"Error creating summary: {str(e)}"
            print(error_msg)
            print(traceback.format_exc())
            return error_msg

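# A minimal headless usage sketch (no Gradio UI), assuming the dependencies above are
# installed and "sample.wav" is a placeholder path to a local audio file:
#
#   summarizer = VideoQASummarizer()
#   summarizer.load_model()
#   transcript = summarizer.transcribe_audio("sample.wav")
#   print(summarizer.summarize_transcript(transcript, "bullet_points"))
#   print(summarizer.answer_question("What is the main topic?", transcript))
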
# Initialize the model
qa_summarizer = VideoQASummarizer()

def load_model_interface():
    """Interface function to load the model"""
    return qa_summarizer.load_model()

def process_video(video_file):
    """Process uploaded video and return transcript"""
    if video_file is None:
        return "Please upload a video file.", ""

    try:
        # Extract audio from video
        status_msg = "Extracting audio from video..."
        audio_path = qa_summarizer.extract_audio_from_video(video_file)

        # Transcribe audio
        status_msg = "Transcribing audio..."
        transcript = qa_summarizer.transcribe_audio(audio_path)

        # Store transcript for later use
        qa_summarizer.current_transcript = transcript

        # Clean up temporary audio file
        if os.path.exists(audio_path):
            os.unlink(audio_path)

        return "Video processed successfully!", transcript
    except Exception as e:
        error_msg = f"Error processing video: {str(e)}"
        print(error_msg)
        print(traceback.format_exc())
        return error_msg, ""

def answer_question_interface(question, transcript):
    """Interface function to answer questions"""
    if not question.strip():
        return "Please enter a question."

    return qa_summarizer.answer_question(question, transcript or qa_summarizer.current_transcript)

def summarize_interface(transcript, summary_type):
    """Interface function to create summaries"""
    return qa_summarizer.summarize_transcript(transcript or qa_summarizer.current_transcript, summary_type)

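# Note: the intermediate status_msg assignments in process_video are never shown in the
# UI, because Gradio only receives the function's return value; wiring a gr.Progress()
# tracker into the function is one way to surface per-step progress.
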
# Create Gradio interface
def create_interface():
    with gr.Blocks(title="Video Q&A and Summarizer", theme=gr.themes.Soft()) as app:
        gr.Markdown("""
        # 🎥 Video Question Answering and Summarizer

        Upload a video file to transcribe its audio content, then ask questions or generate summaries using NVIDIA's Canary-Qwen-2.5B model.

        **Features:**
        - Extract and transcribe audio from video files
        - Ask questions about the video content
        - Generate different types of summaries
        - Powered by NVIDIA NeMo Canary-Qwen-2.5B
        """)

        # Model loading section
        with gr.Row():
            gr.Markdown("## 🚀 Step 1: Load Model")

        with gr.Row():
            load_btn = gr.Button("Load Canary-Qwen-2.5B Model", variant="primary")
            model_status = gr.Textbox(label="Model Status", interactive=False)

        load_btn.click(load_model_interface, outputs=model_status)

        # Video processing section
        with gr.Row():
            gr.Markdown("## 📹 Step 2: Upload and Process Video")

        with gr.Row():
            with gr.Column():
                video_input = gr.Video(label="Upload Video File")
                process_btn = gr.Button("Process Video", variant="primary")

            with gr.Column():
                process_status = gr.Textbox(label="Processing Status", interactive=False)
                transcript_output = gr.Textbox(
                    label="Transcript",
                    lines=10,
                    max_lines=20,
                    interactive=False
                )

        process_btn.click(
            process_video,
            inputs=video_input,
            outputs=[process_status, transcript_output]
        )

        # Question answering section
        with gr.Row():
            gr.Markdown("## ❓ Step 3: Ask Questions")

        with gr.Row():
            with gr.Column():
                question_input = gr.Textbox(
                    label="Your Question",
                    placeholder="What is this video about?",
                    lines=2
                )
                ask_btn = gr.Button("Ask Question", variant="secondary")

            with gr.Column():
                answer_output = gr.Textbox(
                    label="Answer",
                    lines=5,
                    interactive=False
                )

        ask_btn.click(
            answer_question_interface,
            inputs=[question_input, transcript_output],
            outputs=answer_output
        )

        # Summarization section
        with gr.Row():
            gr.Markdown("## 📝 Step 4: Generate Summary")

        with gr.Row():
            with gr.Column():
                summary_type = gr.Dropdown(
                    choices=["general", "detailed", "bullet_points"],
                    value="general",
                    label="Summary Type"
                )
                summarize_btn = gr.Button("Generate Summary", variant="secondary")

            with gr.Column():
                summary_output = gr.Textbox(
                    label="Summary",
                    lines=8,
                    interactive=False
                )

        summarize_btn.click(
            summarize_interface,
            inputs=[transcript_output, summary_type],
            outputs=summary_output
        )

        # Instructions and tips
        with gr.Row():
            gr.Markdown("""
            ## 💡 Tips:

            1. **Supported formats**: MP4, AVI, MOV, MKV, and other common video formats
            2. **Audio quality**: Better audio quality leads to more accurate transcriptions
            3. **Processing time**: Larger videos take longer to process
            4. **Questions**: Be specific with your questions for better answers
            5. **Summaries**: Choose the summary type that best fits your needs

            ## ⚠️ Requirements:
            - PyTorch 2.6+ for FSDP2 support
            - CUDA-compatible GPU recommended for optimal performance
            - Sufficient disk space for temporary audio files
            """)

    return app

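# Note: share=True asks Gradio to open a temporary public tunnel; when the app runs on
# Hugging Face Spaces it is already publicly reachable, so the flag mainly matters for
# local runs.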
# Launch the application
if __name__ == "__main__":
    app = create_interface()
    app.launch(
        share=True
    )