KoonJamesZ committed
Commit 604a2d4 · verified · 1 Parent(s): a36377d

Create app.py

Files changed (1)
  1. app.py +334 -0
app.py ADDED
@@ -0,0 +1,334 @@
+ from transformers import (
+     Qwen2VLForConditionalGeneration,
+     AutoModelForSpeechSeq2Seq,
+     AutoTokenizer,
+     AutoProcessor,
+     pipeline,
+ )
+ from qwen_vl_utils import process_vision_info
+ import torch
+ import uuid
+ import os
+ from moviepy.editor import VideoFileClip
+ import cv2
+ from ultralytics import YOLO
+ from heapq import heappush, heappushpop
+ import numpy as np
+ import gradio as gr
+
+ # # default: Load the model on the available device(s)
+ # model = Qwen2VLForConditionalGeneration.from_pretrained(
+ #     "Qwen/Qwen2-VL-7B-Instruct", torch_dtype="auto", device_map="auto"
+ # )
+
+ # We recommend enabling flash_attention_2 for better acceleration and memory saving,
+ # especially in multi-image and video scenarios.
+ model = Qwen2VLForConditionalGeneration.from_pretrained(
+     "Qwen/Qwen2-VL-7B-Instruct",
+     torch_dtype=torch.bfloat16,
+     attn_implementation="flash_attention_2",
+     device_map="auto",
+ )
+
+ # default processor
+ processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
+
+ # The default range for the number of visual tokens per image is 4-16384. You can set
+ # min_pixels and max_pixels according to your needs, e.g. a token range of 256-1280,
+ # to balance speed and memory usage.
+ # min_pixels = 256 * 28 * 28
+ # max_pixels = 1280 * 28 * 28
+ # processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels)
+
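+ # --- Speech-to-text setup ---
+ # whisper-large-v3 transcribes the audio track extracted from uploaded videos.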
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
+ torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+
+ model_id = "openai/whisper-large-v3"
+
+ model_whisper = AutoModelForSpeechSeq2Seq.from_pretrained(
+     model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
+ )
+ model_whisper.to(device)
+
+ processor_whisper = AutoProcessor.from_pretrained(model_id)
+
+ pipe = pipeline(
+     "automatic-speech-recognition",
+     model=model_whisper,
+     tokenizer=processor_whisper.tokenizer,
+     feature_extractor=processor_whisper.feature_extractor,
+     torch_dtype=torch_dtype,
+     device=device,
+     return_timestamps=True,
+ )
+
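+ # `pipe(audio_path)` returns a dict: with return_timestamps=True it holds the full
+ # transcript under "text" plus timestamped "chunks"; only "text" is used below.
+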
+ output_directory = "temp"  # Replace with your desired output directory
+ os.makedirs(output_directory, exist_ok=True)
+
+ def extract_audio(video_path):
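+     """Extract the audio track from `video_path`, transcribe it with Whisper,
+     and return the transcript text (an empty string on any failure)."""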
+     try:
+         # Load the video file
+         video = VideoFileClip(video_path)
+
+         # Extract the audio track
+         audio = video.audio
+
+         # Generate a unique filename using uuid
+         unique_filename = f"{uuid.uuid4()}.mp3"
+         audio_output_path = f"{output_directory}/{unique_filename}"
+
+         # Save the audio to the unique file
+         audio.write_audiofile(audio_output_path)
+         video.close()  # Release the underlying ffmpeg readers
+
+         result = pipe(audio_output_path)
+
+         os.remove(audio_output_path)
+
+         return result["text"]
+
+     except Exception as e:
+         print(f"Error: {str(e)}")
+         return ""
+
+ output_dir = '/content/images'
+ model_yolo = YOLO('/content/drive/MyDrive/CCIB-AI-YOLO/runs/detect/train/weights/best.pt')
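+ # NOTE: these are Colab/Google Drive paths; adjust `output_dir` and the weights path
+ # to match the deployment environment (e.g. a local folder and a bundled best.pt).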
+
+ def extract_top_weapon_frames(video_path, threshold=30):
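+     """Find the two frames of `video_path` with the highest-confidence weapon detections.
+
+     Frames are pre-filtered by frame differencing (mean absolute pixel difference
+     above `threshold`) so YOLO only runs where the scene visibly changes. A size-2
+     min-heap keeps the best detections. Returns a dict with 'original' and 'boxed'
+     lists of saved image paths (both empty if nothing is detected).
+     """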
+     os.makedirs(output_dir, exist_ok=True)
+     saved_paths = {
+         'original': [],  # Paths for original frames
+         'boxed': []      # Paths for frames with boxes
+     }
+
+     weapon_classes = ['weapon', 'knife']
+     # Min-heap of (confidence, frame_number, original_frame, boxed_frame).
+     # frame_number is a unique tie-breaker so equal confidences never force a
+     # comparison between numpy frame arrays (which would raise a TypeError).
+     top_frames = []
+
+     cap = cv2.VideoCapture(video_path)
+     if not cap.isOpened():
+         print("Error: Could not open video.")
+         return saved_paths
+
+     ret, prev_frame = cap.read()
+     if not ret:
+         print("Error: Could not read the first frame.")
+         return saved_paths
+
+     prev_gray = cv2.cvtColor(prev_frame, cv2.COLOR_BGR2GRAY)
+     frame_number = 0
+
+     while True:
+         ret, frame = cap.read()
+         if not ret:
+             break
+
+         gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
+         frame_diff = cv2.absdiff(gray, prev_gray)
+         mean_diff = frame_diff.mean()
+
+         if mean_diff > threshold:
+             print(f"Processing frame {frame_number}")
+             results = model_yolo.predict(source=frame, show=False)
+
+             frame_max_conf = 0
+             frame_with_boxes = frame.copy()
+
+             for result in results:
+                 for box in result.boxes:
+                     class_id = int(box.cls[0])
+                     class_name = model_yolo.names[class_id]
+                     confidence = float(box.conf[0])
+
+                     if class_name in weapon_classes:
+                         frame_max_conf = max(frame_max_conf, confidence)
+                         x1, y1, x2, y2 = map(int, box.xyxy[0])
+                         cv2.rectangle(frame_with_boxes, (x1, y1), (x2, y2), (0, 255, 0), 2)
+                         label = f"{class_name} ({confidence:.2f})"
+                         cv2.putText(frame_with_boxes, label, (x1, y1 - 10),
+                                     cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
+
+             if frame_max_conf > 0:
+                 if len(top_frames) < 2:
+                     heappush(top_frames, (frame_max_conf, frame_number, frame.copy(), frame_with_boxes))
+                 elif frame_max_conf > top_frames[0][0]:
+                     heappushpop(top_frames, (frame_max_conf, frame_number, frame.copy(), frame_with_boxes))
+
+         prev_gray = gray
+         frame_number += 1
+
+     # Save the top 2 frames (both original and with boxes)
+     for confidence, _, original_frame, boxed_frame in sorted(top_frames, reverse=True):
+         # Save original frame
+         original_filename = f"{uuid.uuid4()}.jpg"
+         original_path = os.path.join(output_dir, original_filename)
+         cv2.imwrite(original_path, original_frame)
+         saved_paths['original'].append(original_path)
+
+         # Save frame with boxes
+         boxed_filename = f"{uuid.uuid4()}.jpg"
+         boxed_path = os.path.join(output_dir, boxed_filename)
+         cv2.imwrite(boxed_path, boxed_frame)
+         saved_paths['boxed'].append(boxed_path)
+
+         print(f"Saved frame pair with confidence {confidence:.3f}")
+
+     cap.release()
+     return saved_paths
+
+ def detect_weapon_image(source_image_path):
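+     """Run YOLO weapon detection on a single image, save the annotated result(s)
+     to `output_dir`, and return the list of saved file paths."""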
+     # Ensure the output directory exists
+     os.makedirs(output_dir, exist_ok=True)
+
+     # Run YOLO predictions
+     results = model_yolo.predict(source=source_image_path, save=False, show=False)
+
+     # List to store paths to saved images
+     saved_paths = []
+
+     for result in results:
+         # Get the annotated image
+         annotated_img = result.plot()
+
+         # Generate a unique filename using UUID
+         unique_filename = f"{uuid.uuid4()}.jpg"
+         output_path = os.path.join(output_dir, unique_filename)
+
+         # Save the annotated image
+         cv2.imwrite(output_path, annotated_img)
+         saved_paths.append(output_path)
+
+     return saved_paths
+
+ def response(messages):
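+     """Run Qwen2-VL-7B-Instruct on a chat-style `messages` list (system + user turns
+     containing image and text items) and return the decoded reply as a string."""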
+     # Preparation for inference
+     text = processor.apply_chat_template(
+         messages, tokenize=False, add_generation_prompt=True
+     )
+     image_inputs, video_inputs = process_vision_info(messages)
+     inputs = processor(
+         text=[text],
+         images=image_inputs,
+         videos=video_inputs,
+         padding=True,
+         return_tensors="pt",
+     )
+     inputs = inputs.to("cuda")
+
+     # Inference: generation of the output
+     generated_ids = model.generate(**inputs, max_new_tokens=1024)
+     generated_ids_trimmed = [
+         out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+     ]
+     output_text = processor.batch_decode(
+         generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+     )
+     return output_text[0]
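+
+ # Example of the `messages` structure accepted by response() (hypothetical values):
+ #   [{"role": "system", "content": system_prompt},
+ #    {"role": "user", "content": [
+ #        {"type": "image", "image": "file:///tmp/frame.jpg"},
+ #        {"type": "text", "text": "Describe this image."}]}]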
+
+
+ system_prompt = """
+ Analyze the image for illegal items or contraband. Detect and categorize objects like guns, knives, drugs, and hidden compartments. Highlight areas of interest and provide:
+
+ 1. A detailed explanation in Thai describing the illegal items and their context.
+ 2. A JSON output summarizing the findings.
+
+ Output example:
+ 1. Explanation (Thai): (detailed explanation in Thai describing the illegal items and their context)
+ 2. JSON: [{"category": "weapon", "type": "gun"}]
+ """
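+ # The model is prompted for a Thai explanation plus a JSON summary; process_inputs()
+ # passes its reply back verbatim to the Gradio textbox.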
+
+
+ def is_mp4_file(file_path):
+     # Guard against file_path being None when no file is uploaded.
+     return bool(file_path) and os.path.isfile(file_path) and file_path.lower().endswith(".mp4")
+
+ def process_inputs(text_input, file_input):
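+     """Gradio handler. For .mp4 uploads, extract key frames and a transcript and send
+     both to Qwen2-VL; for image uploads, send the image directly. Returns a tuple of
+     (analysis text, list of annotated image paths for the gallery)."""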
+     if is_mp4_file(file_input):
+         extract_images_from_video = extract_top_weapon_frames(file_input)
+         transcription = extract_audio(file_input)
+
+         try:
+             # Prepare image content for the messages
+             image_content = []
+
+             # Add the first extracted frame if available
+             if extract_images_from_video['original']:
+                 image_content.append({
+                     "type": "image",
+                     "image": f"file://{extract_images_from_video['original'][0]}"
+                 })
+
+             # Add the second extracted frame if available
+             if len(extract_images_from_video['original']) > 1:
+                 image_content.append({
+                     "type": "image",
+                     "image": f"file://{extract_images_from_video['original'][1]}"
+                 })
+
+             # Create the messages list with the available content
+             messages = [
+                 {"role": "system", "content": system_prompt},
+                 {
+                     "role": "user",
+                     "content": [
+                         *image_content,  # Unpack available image content
+                         {"type": "text", "text": f"Content From Social Media Post: {text_input}."},
+                         {"type": "text", "text": f"This is the transcription from the video: {transcription}"}
+                     ]
+                 }
+             ]
+
+             # Return the model response and the available boxed images (empty list if none)
+             return response(messages), extract_images_from_video.get('boxed', [])
+
+         except Exception as e:
+             return f"Error: {str(e)}", []
+
+     else:
+         try:
+             # Build a single-image message from the uploaded file
+             messages = [
+                 {"role": "system", "content": system_prompt},
+                 {
+                     "role": "user",
+                     "content": [
+                         {
+                             "type": "image",
+                             "image": f"file://{file_input}",
+                         },
+                         {"type": "text", "text": f"Content From Social Media Post: {text_input}."},
+                     ],
+                 }
+             ]
+
+             result = response(messages)
+             detect_weapon = detect_weapon_image(file_input)
+             # Optionally, delete the temporary file after processing
+
+             return result, detect_weapon
+         except Exception as e:
+             # Handle any exceptions and return the error
+             return f"Error: {str(e)}", []
+
+ # Create the Gradio interface
+ demo = gr.Interface(
+     fn=process_inputs,
+     inputs=[
+         gr.Textbox(
+             label="Text Input",
+             placeholder="Enter your text here...",
+             lines=3
+         ),
+         gr.File(
+             label="File Upload",
+             file_types=[".mp4", ".png", ".jpeg", ".jpg"],
+             type="filepath"
+         )
+     ],
+     outputs=[
+         gr.Textbox(label="Process Results", lines=8),
+         gr.Gallery(label="Generated images", show_label=False, elem_id="gallery", columns=[2], rows=[1], object_fit="contain", height="auto")
+     ],
+     title="Text and File Input Processor (Qwen2-VL-7B-Instruct)",
+     description="Enter text and/or upload a file to process them together",
+ )
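+ # process_inputs() returns (analysis_text, image_paths), matching the Textbox and
+ # Gallery outputs declared above.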
+
+ if __name__ == "__main__":
+     demo.launch()