Create app.py
app.py
ADDED
@@ -0,0 +1,364 @@
import gradio as gr
import os
import tempfile
import subprocess
import librosa
import soundfile as sf
import torch
from pathlib import Path
import traceback
from typing import List, Dict, Tuple, Optional

# Install required packages
def install_requirements():
    """Install required packages if not already installed"""
    try:
        import nemo
        print("NeMo already installed")
    except ImportError:
        print("Installing NeMo...")
        subprocess.run([
            "pip", "install",
            "nemo_toolkit[asr,tts] @ git+https://github.com/NVIDIA/NeMo.git"
        ], check=True)

    try:
        import moviepy
        print("MoviePy already installed")
    except ImportError:
        print("Installing MoviePy...")
        subprocess.run(["pip", "install", "moviepy"], check=True)

# Try to install requirements
try:
    install_requirements()
    from nemo.collections.speechlm2.models import SALM
    import moviepy.editor as mp
    DEPENDENCIES_AVAILABLE = True
except Exception as e:
    print(f"Warning: Could not install dependencies: {e}")
    DEPENDENCIES_AVAILABLE = False

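# Note: installing packages at import time like this assumes a writable Python
# environment (e.g. a local venv or a Space that permits runtime installs); the more
# common route is to pin nemo_toolkit[asr,tts] and moviepy in requirements.txt.
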
class VideoQASummarizer:
    def __init__(self):
        self.model = None
        self.current_transcript = ""
        self.model_loaded = False

    def load_model(self):
        """Load the Canary-Qwen-2.5B model"""
        if not DEPENDENCIES_AVAILABLE:
            return "Error: Required dependencies not available. Please install manually."

        try:
            if self.model is None:
                print("Loading Canary-Qwen-2.5B model...")
                self.model = SALM.from_pretrained('nvidia/canary-qwen-2.5b')
                self.model_loaded = True
                return "Model loaded successfully!"
            return "Model already loaded."
        except Exception as e:
            error_msg = f"Error loading model: {str(e)}"
            print(error_msg)
            print(traceback.format_exc())
            return error_msg

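    # Note: SALM.from_pretrained() pulls the nvidia/canary-qwen-2.5b checkpoint from
    # the Hugging Face Hub on first use, so the initial load can take a while and
    # needs enough RAM/VRAM for a 2.5B-parameter model.
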
    def extract_audio_from_video(self, video_path: str) -> str:
        """Extract audio from video file"""
        try:
            # Create temporary audio file
            temp_audio = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
            temp_audio_path = temp_audio.name
            temp_audio.close()

            # Load video and extract audio
            video = mp.VideoFileClip(video_path)
            audio = video.audio

            # Write audio to temporary file
            audio.write_audiofile(temp_audio_path, verbose=False, logger=None)

            # Clean up
            audio.close()
            video.close()

            return temp_audio_path
        except Exception as e:
            raise Exception(f"Error extracting audio: {str(e)}")

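    # Note: `import moviepy.editor` and the `verbose=` keyword used above follow the
    # MoviePy 1.x API; MoviePy 2.x removed the moviepy.editor module and the verbose
    # flag, so this code assumes a 1.x install.
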
    def preprocess_audio(self, audio_path: str) -> str:
        """Preprocess audio for the model (ensure correct format)"""
        try:
            # Load audio
            audio, sr = librosa.load(audio_path, sr=16000)  # Resample to 16kHz if needed

            # Create new temporary file for processed audio
            temp_processed = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
            temp_processed_path = temp_processed.name
            temp_processed.close()

            # Save processed audio
            sf.write(temp_processed_path, audio, 16000)

            return temp_processed_path
        except Exception as e:
            raise Exception(f"Error preprocessing audio: {str(e)}")

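    # Note: canary-qwen-2.5b expects 16 kHz mono-channel input, which is why the audio
    # is resampled to 16000 Hz here; librosa.load() also downmixes to mono by default.
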
    def transcribe_audio(self, audio_path: str) -> str:
        """Transcribe audio using Canary-Qwen-2.5B in ASR mode"""
        try:
            if not self.model_loaded:
                return "Error: Model not loaded. Please load the model first."

            # Preprocess audio
            processed_audio_path = self.preprocess_audio(audio_path)

            # Transcribe using ASR mode
            answer_ids = self.model.generate(
                prompts=[
                    [{"role": "user", "content": f"Transcribe the following: {self.model.audio_locator_tag}", "audio": [processed_audio_path]}]
                ],
                max_new_tokens=512,
            )

            transcript = self.model.tokenizer.ids_to_text(answer_ids[0].cpu())

            # Clean up temporary file
            os.unlink(processed_audio_path)

            return transcript.strip()
        except Exception as e:
            error_msg = f"Error during transcription: {str(e)}"
            print(error_msg)
            print(traceback.format_exc())
            return error_msg

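    # Note: the prompt above (a user turn containing model.audio_locator_tag plus an
    # "audio" list of file paths) mirrors the ASR usage shown on the canary-qwen-2.5b
    # model card. max_new_tokens=512 caps the transcript length, so very long recordings
    # may come back truncated; splitting the audio into chunks and transcribing each one
    # is a possible workaround.
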
    def answer_question(self, question: str, transcript: str) -> str:
        """Answer questions about the transcript using LLM mode"""
        try:
            if not self.model_loaded:
                return "Error: Model not loaded. Please load the model first."

            if not transcript:
                return "Error: No transcript available. Please transcribe a video first."

            # Use LLM mode to answer questions
            prompt = f"Based on the following transcript, please answer this question: {question}\n\nTranscript: {transcript}"

            with self.model.llm.disable_adapter():
                answer_ids = self.model.generate(
                    prompts=[[{"role": "user", "content": prompt}]],
                    max_new_tokens=512,
                )

            answer = self.model.tokenizer.ids_to_text(answer_ids[0].cpu())
            return answer.strip()
        except Exception as e:
            error_msg = f"Error answering question: {str(e)}"
            print(error_msg)
            print(traceback.format_exc())
            return error_msg

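    # Note: model.llm.disable_adapter() runs the underlying Qwen LLM in text-only mode,
    # as shown on the model card. Embedding the whole transcript in the prompt assumes
    # it fits in the LLM's context window; very long transcripts may need to be
    # shortened or chunked first.
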
    def summarize_transcript(self, transcript: str, summary_type: str = "general") -> str:
        """Summarize the transcript using LLM mode"""
        try:
            if not self.model_loaded:
                return "Error: Model not loaded. Please load the model first."

            if not transcript:
                return "Error: No transcript available. Please transcribe a video first."

            # Create different summary prompts based on type
            if summary_type == "bullet_points":
                prompt = f"Please create a bullet-point summary of the key points from this transcript:\n\n{transcript}"
            elif summary_type == "detailed":
                prompt = f"Please provide a detailed summary of this transcript, including main topics and important details:\n\n{transcript}"
            else:  # general
                prompt = f"Please provide a concise summary of this transcript:\n\n{transcript}"

            with self.model.llm.disable_adapter():
                answer_ids = self.model.generate(
                    prompts=[[{"role": "user", "content": prompt}]],
                    max_new_tokens=1024,
                )

            summary = self.model.tokenizer.ids_to_text(answer_ids[0].cpu())
            return summary.strip()
        except Exception as e:
            error_msg = f"Error creating summary: {str(e)}"
            print(error_msg)
            print(traceback.format_exc())
            return error_msg

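# A minimal headless usage sketch (no Gradio UI), assuming the dependencies above are
# installed and "sample.wav" is a placeholder path to a local audio file:
#
#   summarizer = VideoQASummarizer()
#   summarizer.load_model()
#   transcript = summarizer.transcribe_audio("sample.wav")
#   print(summarizer.summarize_transcript(transcript, "bullet_points"))
#   print(summarizer.answer_question("What is the main topic?", transcript))
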
# Initialize the model
qa_summarizer = VideoQASummarizer()

def load_model_interface():
    """Interface function to load the model"""
    return qa_summarizer.load_model()

def process_video(video_file):
    """Process uploaded video and return transcript"""
    if video_file is None:
        return "Please upload a video file.", ""

    try:
        # Extract audio from video
        status_msg = "Extracting audio from video..."
        audio_path = qa_summarizer.extract_audio_from_video(video_file)

        # Transcribe audio
        status_msg = "Transcribing audio..."
        transcript = qa_summarizer.transcribe_audio(audio_path)

        # Store transcript for later use
        qa_summarizer.current_transcript = transcript

        # Clean up temporary audio file
        if os.path.exists(audio_path):
            os.unlink(audio_path)

        return "Video processed successfully!", transcript
    except Exception as e:
        error_msg = f"Error processing video: {str(e)}"
        print(error_msg)
        print(traceback.format_exc())
        return error_msg, ""

def answer_question_interface(question, transcript):
    """Interface function to answer questions"""
    if not question.strip():
        return "Please enter a question."

    return qa_summarizer.answer_question(question, transcript or qa_summarizer.current_transcript)

def summarize_interface(transcript, summary_type):
    """Interface function to create summaries"""
    return qa_summarizer.summarize_transcript(transcript or qa_summarizer.current_transcript, summary_type)

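# Note: the intermediate status_msg assignments in process_video are never shown in the
# UI, because Gradio only receives the function's return value; wiring a gr.Progress()
# tracker into the function is one way to surface per-step progress.
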
# Create Gradio interface
def create_interface():
    with gr.Blocks(title="Video Q&A and Summarizer", theme=gr.themes.Soft()) as app:
        gr.Markdown("""
        # 🎥 Video Question Answering and Summarizer

        Upload a video file to transcribe its audio content, then ask questions or generate summaries using NVIDIA's Canary-Qwen-2.5B model.

        **Features:**
        - Extract and transcribe audio from video files
        - Ask questions about the video content
        - Generate different types of summaries
        - Powered by NVIDIA NeMo Canary-Qwen-2.5B
        """)

        # Model loading section
        with gr.Row():
            gr.Markdown("## 🚀 Step 1: Load Model")

        with gr.Row():
            load_btn = gr.Button("Load Canary-Qwen-2.5B Model", variant="primary")
            model_status = gr.Textbox(label="Model Status", interactive=False)

        load_btn.click(load_model_interface, outputs=model_status)

        # Video processing section
        with gr.Row():
            gr.Markdown("## 📹 Step 2: Upload and Process Video")

        with gr.Row():
            with gr.Column():
                video_input = gr.Video(label="Upload Video File")
                process_btn = gr.Button("Process Video", variant="primary")

            with gr.Column():
                process_status = gr.Textbox(label="Processing Status", interactive=False)
                transcript_output = gr.Textbox(
                    label="Transcript",
                    lines=10,
                    max_lines=20,
                    interactive=False
                )

        process_btn.click(
            process_video,
            inputs=video_input,
            outputs=[process_status, transcript_output]
        )

        # Question answering section
        with gr.Row():
            gr.Markdown("## ❓ Step 3: Ask Questions")

        with gr.Row():
            with gr.Column():
                question_input = gr.Textbox(
                    label="Your Question",
                    placeholder="What is this video about?",
                    lines=2
                )
                ask_btn = gr.Button("Ask Question", variant="secondary")

            with gr.Column():
                answer_output = gr.Textbox(
                    label="Answer",
                    lines=5,
                    interactive=False
                )

        ask_btn.click(
            answer_question_interface,
            inputs=[question_input, transcript_output],
            outputs=answer_output
        )

        # Summarization section
        with gr.Row():
            gr.Markdown("## 📝 Step 4: Generate Summary")

        with gr.Row():
            with gr.Column():
                summary_type = gr.Dropdown(
                    choices=["general", "detailed", "bullet_points"],
                    value="general",
                    label="Summary Type"
                )
                summarize_btn = gr.Button("Generate Summary", variant="secondary")

            with gr.Column():
                summary_output = gr.Textbox(
                    label="Summary",
                    lines=8,
                    interactive=False
                )

        summarize_btn.click(
            summarize_interface,
            inputs=[transcript_output, summary_type],
            outputs=summary_output
        )

        # Instructions and tips
        with gr.Row():
            gr.Markdown("""
            ## 💡 Tips:

            1. **Supported formats**: MP4, AVI, MOV, MKV, and other common video formats
            2. **Audio quality**: Better audio quality leads to more accurate transcriptions
            3. **Processing time**: Larger videos take longer to process
            4. **Questions**: Be specific with your questions for better answers
            5. **Summaries**: Choose the summary type that best fits your needs

            ## ⚠️ Requirements:
            - PyTorch 2.6+ for FSDP2 support
            - CUDA-compatible GPU recommended for optimal performance
            - Sufficient disk space for temporary audio files
            """)

    return app

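# Note: share=True asks Gradio to open a temporary public tunnel; when the app runs on
# Hugging Face Spaces it is already publicly reachable, so the flag mainly matters for
# local runs.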
# Launch the application
if __name__ == "__main__":
    app = create_interface()
    app.launch(
        share=True
    )