shukdevdattaEX committed
Commit 25e56df · verified
1 Parent(s): c8a787f

Create app.py

Files changed (1)
app.py +364 -0
app.py ADDED
@@ -0,0 +1,364 @@
+ import gradio as gr
+ import os
+ import sys
+ import tempfile
+ import subprocess
+ import librosa
+ import soundfile as sf
+ import traceback
+
+ # Install required packages at startup
+ def install_requirements():
+     """Install required packages if they are not already installed."""
+     try:
+         import nemo
+         print("NeMo already installed")
+     except ImportError:
+         print("Installing NeMo...")
+         subprocess.run([
+             sys.executable, "-m", "pip", "install",
+             "nemo_toolkit[asr,tts] @ git+https://github.com/NVIDIA/NeMo.git"
+         ], check=True)
+
+     try:
+         import moviepy
+         print("MoviePy already installed")
+     except ImportError:
+         print("Installing MoviePy...")
+         # Pin to 1.x: the moviepy.editor module used below was removed in MoviePy 2.0
+         subprocess.run([sys.executable, "-m", "pip", "install", "moviepy<2.0"], check=True)
+
+ # Try to install requirements
+ try:
+     install_requirements()
+     from nemo.collections.speechlm2.models import SALM
+     import moviepy.editor as mp
+     DEPENDENCIES_AVAILABLE = True
+ except Exception as e:
+     print(f"Warning: Could not install dependencies: {e}")
+     DEPENDENCIES_AVAILABLE = False
+
+ class VideoQASummarizer:
+     def __init__(self):
+         self.model = None
+         self.current_transcript = ""
+         self.model_loaded = False
+
+     def load_model(self):
+         """Load the Canary-Qwen-2.5B model."""
+         if not DEPENDENCIES_AVAILABLE:
+             return "Error: Required dependencies not available. Please install them manually."
+
+         try:
+             if self.model is None:
+                 print("Loading Canary-Qwen-2.5B model...")
+                 self.model = SALM.from_pretrained('nvidia/canary-qwen-2.5b')
+                 self.model_loaded = True
+                 return "Model loaded successfully!"
+             return "Model already loaded."
+         except Exception as e:
+             error_msg = f"Error loading model: {str(e)}"
+             print(error_msg)
+             print(traceback.format_exc())
+             return error_msg
+
+     def extract_audio_from_video(self, video_path: str) -> str:
+         """Extract the audio track of a video file into a temporary WAV file."""
+         try:
+             # Create a temporary audio file
+             temp_audio = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
+             temp_audio_path = temp_audio.name
+             temp_audio.close()
+
+             # Load the video and grab its audio track
+             video = mp.VideoFileClip(video_path)
+             audio = video.audio
+             if audio is None:
+                 video.close()
+                 raise ValueError("The uploaded video has no audio track.")
+
+             # Write the audio to the temporary file
+             audio.write_audiofile(temp_audio_path, verbose=False, logger=None)
+
+             # Release file handles
+             audio.close()
+             video.close()
+
+             return temp_audio_path
+         except Exception as e:
+             raise RuntimeError(f"Error extracting audio: {str(e)}") from e
+
+     def preprocess_audio(self, audio_path: str) -> str:
+         """Preprocess audio for the model (mono 16 kHz WAV)."""
+         try:
+             # Load the audio, resampling to the 16 kHz the model expects
+             audio, _ = librosa.load(audio_path, sr=16000)
+
+             # Create a new temporary file for the processed audio
+             temp_processed = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
+             temp_processed_path = temp_processed.name
+             temp_processed.close()
+
+             # Save the processed audio
+             sf.write(temp_processed_path, audio, 16000)
+
+             return temp_processed_path
+         except Exception as e:
+             raise RuntimeError(f"Error preprocessing audio: {str(e)}") from e
+
+     def transcribe_audio(self, audio_path: str) -> str:
+         """Transcribe audio using Canary-Qwen-2.5B in ASR mode."""
+         try:
+             if not self.model_loaded:
+                 return "Error: Model not loaded. Please load the model first."
+
+             # Preprocess the audio
+             processed_audio_path = self.preprocess_audio(audio_path)
+
+             # Transcribe in ASR mode: the audio locator tag in the prompt is
+             # filled in from the files listed under "audio"
+             answer_ids = self.model.generate(
+                 prompts=[
+                     [{
+                         "role": "user",
+                         "content": f"Transcribe the following: {self.model.audio_locator_tag}",
+                         "audio": [processed_audio_path],
+                     }]
+                 ],
+                 max_new_tokens=512,
+             )
+
+             transcript = self.model.tokenizer.ids_to_text(answer_ids[0].cpu())
+
+             # Clean up the temporary file
+             os.unlink(processed_audio_path)
+
+             return transcript.strip()
+         except Exception as e:
+             error_msg = f"Error during transcription: {str(e)}"
+             print(error_msg)
+             print(traceback.format_exc())
+             return error_msg
+
+     def answer_question(self, question: str, transcript: str) -> str:
+         """Answer a question about the transcript using LLM-only mode."""
+         try:
+             if not self.model_loaded:
+                 return "Error: Model not loaded. Please load the model first."
+
+             if not transcript:
+                 return "Error: No transcript available. Please transcribe a video first."
+
+             prompt = (
+                 f"Based on the following transcript, please answer this question: {question}\n\n"
+                 f"Transcript: {transcript}"
+             )
+
+             # Disabling the speech adapter switches the model to pure LLM mode
+             with self.model.llm.disable_adapter():
+                 answer_ids = self.model.generate(
+                     prompts=[[{"role": "user", "content": prompt}]],
+                     max_new_tokens=512,
+                 )
+
+             answer = self.model.tokenizer.ids_to_text(answer_ids[0].cpu())
+             return answer.strip()
+         except Exception as e:
+             error_msg = f"Error answering question: {str(e)}"
+             print(error_msg)
+             print(traceback.format_exc())
+             return error_msg
+
+     def summarize_transcript(self, transcript: str, summary_type: str = "general") -> str:
+         """Summarize the transcript using LLM-only mode."""
+         try:
+             if not self.model_loaded:
+                 return "Error: Model not loaded. Please load the model first."
+
+             if not transcript:
+                 return "Error: No transcript available. Please transcribe a video first."
+
+             # Choose a prompt for the requested summary style
+             if summary_type == "bullet_points":
+                 prompt = f"Please create a bullet-point summary of the key points from this transcript:\n\n{transcript}"
+             elif summary_type == "detailed":
+                 prompt = f"Please provide a detailed summary of this transcript, including main topics and important details:\n\n{transcript}"
+             else:  # general
+                 prompt = f"Please provide a concise summary of this transcript:\n\n{transcript}"
+
+             with self.model.llm.disable_adapter():
+                 answer_ids = self.model.generate(
+                     prompts=[[{"role": "user", "content": prompt}]],
+                     max_new_tokens=1024,
+                 )
+
+             summary = self.model.tokenizer.ids_to_text(answer_ids[0].cpu())
+             return summary.strip()
+         except Exception as e:
+             error_msg = f"Error creating summary: {str(e)}"
+             print(error_msg)
+             print(traceback.format_exc())
+             return error_msg
+
+ # Shared application state
+ qa_summarizer = VideoQASummarizer()
+
+ def load_model_interface():
+     """Interface function to load the model."""
+     return qa_summarizer.load_model()
+
+ def process_video(video_file):
+     """Process an uploaded video and return its transcript."""
+     if video_file is None:
+         return "Please upload a video file.", ""
+
+     try:
+         # Extract the audio from the video, then transcribe it
+         audio_path = qa_summarizer.extract_audio_from_video(video_file)
+         transcript = qa_summarizer.transcribe_audio(audio_path)
+
+         # Store the transcript for later questions and summaries
+         qa_summarizer.current_transcript = transcript
+
+         # Clean up the temporary audio file
+         if os.path.exists(audio_path):
+             os.unlink(audio_path)
+
+         return "Video processed successfully!", transcript
+     except Exception as e:
+         error_msg = f"Error processing video: {str(e)}"
+         print(error_msg)
+         print(traceback.format_exc())
+         return error_msg, ""
+
+ def answer_question_interface(question, transcript):
+     """Interface function to answer questions."""
+     if not question.strip():
+         return "Please enter a question."
+
+     return qa_summarizer.answer_question(question, transcript or qa_summarizer.current_transcript)
+
+ def summarize_interface(transcript, summary_type):
+     """Interface function to create summaries."""
+     return qa_summarizer.summarize_transcript(transcript or qa_summarizer.current_transcript, summary_type)
+
+ # Build the Gradio interface
+ def create_interface():
+     with gr.Blocks(title="Video Q&A and Summarizer", theme=gr.themes.Soft()) as app:
+         gr.Markdown("""
+         # 🎥 Video Question Answering and Summarizer
+
+         Upload a video file to transcribe its audio content, then ask questions or generate summaries using NVIDIA's Canary-Qwen-2.5B model.
+
+         **Features:**
+         - Extract and transcribe audio from video files
+         - Ask questions about the video content
+         - Generate different types of summaries
+         - Powered by NVIDIA NeMo Canary-Qwen-2.5B
+         """)
+
+         # Model loading section
+         with gr.Row():
+             gr.Markdown("## 🚀 Step 1: Load Model")
+
+         with gr.Row():
+             load_btn = gr.Button("Load Canary-Qwen-2.5B Model", variant="primary")
+             model_status = gr.Textbox(label="Model Status", interactive=False)
+
+         load_btn.click(load_model_interface, outputs=model_status)
+
+         # Video processing section
+         with gr.Row():
+             gr.Markdown("## 📹 Step 2: Upload and Process Video")
+
+         with gr.Row():
+             with gr.Column():
+                 video_input = gr.Video(label="Upload Video File")
+                 process_btn = gr.Button("Process Video", variant="primary")
+
+             with gr.Column():
+                 process_status = gr.Textbox(label="Processing Status", interactive=False)
+                 transcript_output = gr.Textbox(
+                     label="Transcript",
+                     lines=10,
+                     max_lines=20,
+                     interactive=False
+                 )
+
+         process_btn.click(
+             process_video,
+             inputs=video_input,
+             outputs=[process_status, transcript_output]
+         )
+
+         # Question answering section
+         with gr.Row():
+             gr.Markdown("## ❓ Step 3: Ask Questions")
+
+         with gr.Row():
+             with gr.Column():
+                 question_input = gr.Textbox(
+                     label="Your Question",
+                     placeholder="What is this video about?",
+                     lines=2
+                 )
+                 ask_btn = gr.Button("Ask Question", variant="secondary")
+
+             with gr.Column():
+                 answer_output = gr.Textbox(
+                     label="Answer",
+                     lines=5,
+                     interactive=False
+                 )
+
+         ask_btn.click(
+             answer_question_interface,
+             inputs=[question_input, transcript_output],
+             outputs=answer_output
+         )
+
+         # Summarization section
+         with gr.Row():
+             gr.Markdown("## 📝 Step 4: Generate Summary")
+
+         with gr.Row():
+             with gr.Column():
+                 summary_type = gr.Dropdown(
+                     choices=["general", "detailed", "bullet_points"],
+                     value="general",
+                     label="Summary Type"
+                 )
+                 summarize_btn = gr.Button("Generate Summary", variant="secondary")
+
+             with gr.Column():
+                 summary_output = gr.Textbox(
+                     label="Summary",
+                     lines=8,
+                     interactive=False
+                 )
+
+         summarize_btn.click(
+             summarize_interface,
+             inputs=[transcript_output, summary_type],
+             outputs=summary_output
+         )
+
+         # Instructions and tips
+         with gr.Row():
+             gr.Markdown("""
+             ## 💡 Tips:
+
+             1. **Supported formats**: MP4, AVI, MOV, MKV, and other common video formats
+             2. **Audio quality**: Better audio quality leads to more accurate transcriptions
+             3. **Processing time**: Larger videos take longer to process
+             4. **Questions**: Be specific with your questions for better answers
+             5. **Summaries**: Choose the summary type that best fits your needs
+
+             ## ⚠️ Requirements:
+             - PyTorch 2.6+ for FSDP2 support
+             - CUDA-compatible GPU recommended for optimal performance
+             - Sufficient disk space for temporary audio files
+             """)
+
+     return app
+
+ # Launch the application
+ if __name__ == "__main__":
+     app = create_interface()
+     app.launch(share=True)
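
For reference, a minimal sketch of exercising the same pipeline without the Gradio UI, assuming the dependencies above installed cleanly; "sample.mp4" is a hypothetical placeholder path, not a file in this repo:

# Hypothetical smoke test for VideoQASummarizer (paths are placeholders)
qa = VideoQASummarizer()
print(qa.load_model())  # downloads nvidia/canary-qwen-2.5b on first use
audio_path = qa.extract_audio_from_video("sample.mp4")
transcript = qa.transcribe_audio(audio_path)
print(qa.answer_question("What is this video about?", transcript))
print(qa.summarize_transcript(transcript, summary_type="bullet_points"))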