# Hosting-page residue ("Spaces: Sleeping") captured with this file; not part of the program.
from transformers import Qwen2VLForConditionalGeneration, AutoTokenizer, AutoProcessor | |
from qwen_vl_utils import process_vision_info | |
import torch | |
import uuid | |
from moviepy.editor import VideoFileClip | |
import os | |
import torch | |
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline | |
import cv2 | |
from ultralytics import YOLO | |
from heapq import heappush, heappushpop | |
import numpy as np | |
import uuid | |
import uuid | |
from ultralytics import YOLO | |
import gradio as gr | |
# Vision-language model used to analyse images and video frames.
# It is loaded in bfloat16 with FlashAttention-2, which is recommended for
# better acceleration and memory saving, especially in multi-image and video
# scenarios. The plain alternative is
# from_pretrained(<checkpoint>, torch_dtype="auto", device_map="auto").
_qwen_checkpoint = "Qwen/Qwen2-VL-7B-Instruct"
_qwen_load_options = {
    "torch_dtype": torch.bfloat16,
    "attn_implementation": "flash_attention_2",
    "device_map": "auto",
}
model = Qwen2VLForConditionalGeneration.from_pretrained(
    _qwen_checkpoint, **_qwen_load_options
)

# Default processor. The default range for the number of visual tokens per
# image is 4-16384; to balance speed and memory usage, pass e.g.
# min_pixels=256*28*28 and max_pixels=1280*28*28 to from_pretrained()
# (a token count range of 256-1280).
processor = AutoProcessor.from_pretrained(_qwen_checkpoint)
# Speech-recognition setup: run Whisper in half precision on the first GPU
# when CUDA is available, otherwise in float32 on the CPU.
if torch.cuda.is_available():
    device, torch_dtype = "cuda:0", torch.float16
else:
    device, torch_dtype = "cpu", torch.float32

model_id = "openai/whisper-large-v3"
model_whisper = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
model_whisper.to(device)
processor_whisper = AutoProcessor.from_pretrained(model_id)

# The pipeline reuses the tokenizer and feature extractor bundled with the
# Whisper processor.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model_whisper,
    tokenizer=processor_whisper.tokenizer,
    feature_extractor=processor_whisper.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
    return_timestamps=True,
)

# Scratch directory for temporary audio files; replace with your desired
# output directory.
output_directory = "temp"
os.makedirs(output_directory, exist_ok=True)
def extract_audio(video_path):
    """Transcribe the audio track of a video file.

    The audio is written to a uniquely named temporary MP3 inside
    ``output_directory``, passed to the speech-recognition ``pipe`` and then
    deleted again.

    Args:
        video_path: Path of the video file to read.

    Returns:
        The ``"text"`` field of the pipeline result, or an empty string when
        the video has no audio track or any step fails (the error is printed).
    """
    video = None
    audio_output_path = None
    try:
        # Load the video file and grab its audio track
        video = VideoFileClip(video_path)
        audio = video.audio
        if audio is None:
            # Silent video: nothing to transcribe.
            return ""
        # A uuid-based name keeps concurrent requests from colliding
        audio_output_path = os.path.join(output_directory, f"{uuid.uuid4()}.mp3")
        audio.write_audiofile(audio_output_path)
        result = pipe(audio_output_path)
        return result["text"]
    except Exception as e:
        # Best effort: a failed transcription must not abort the caller.
        print(f"Error: {str(e)}")
        return ""
    finally:
        # Release the clip's readers and the temporary file on every path,
        # including failures halfway through.
        if video is not None:
            video.close()
        if audio_output_path is not None:
            try:
                os.remove(audio_output_path)
            except FileNotFoundError:
                pass  # the file was never written
            except OSError as e:
                print(f"Error: {str(e)}")
# Directory that receives the images written by the YOLO-based helpers.
output_dir = "/content/images"
# Object detector, loaded from the weights file at /model/best.pt.
model_yolo = YOLO("/model/best.pt")
def extract_top_weapon_frames(video_path, threshold=30):
    """Save the two video frames with the most confident weapon detections.

    Each frame is compared with the frame before it; only frames whose mean
    absolute grayscale difference exceeds ``threshold`` are sent to the YOLO
    model. For every such frame the highest confidence among detections of
    the classes ``'weapon'`` and ``'knife'`` is recorded, and the two
    best-scoring frames are written to ``output_dir`` twice: untouched and
    with boxes/labels drawn.

    Args:
        video_path: Path of the video to scan.
        threshold: Minimum mean grayscale difference to the previous frame
            for a frame to be analysed.

    Returns:
        A dict with the keys ``'original'`` and ``'boxed'``, each a list of
        saved JPEG paths ordered from highest to lowest confidence. Both
        lists are empty when the video cannot be opened or read, or when
        nothing was detected.
    """
    os.makedirs(output_dir, exist_ok=True)
    saved_paths = {
        'original': [],  # Paths for original frames
        'boxed': [],     # Paths for frames with boxes
    }
    weapon_classes = ('weapon', 'knife')
    max_kept = 2
    # Min-heap of (confidence, frame_number, original_frame, boxed_frame).
    # frame_number is unique, so it settles confidence ties before tuple
    # comparison can reach the image arrays (comparing those raises
    # ValueError).
    top_frames = []

    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            print("Error: Could not open video.")
            return saved_paths

        ret, prev_frame = cap.read()
        if not ret:
            print("Error: Could not read the first frame.")
            return saved_paths
        prev_gray = cv2.cvtColor(prev_frame, cv2.COLOR_BGR2GRAY)

        # Counts the frames read after the first one.
        frame_number = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            mean_diff = cv2.absdiff(gray, prev_gray).mean()
            if mean_diff > threshold:
                print(f"Processing frame {frame_number}")
                results = model_yolo.predict(source=frame, show=False)
                frame_max_conf, frame_with_boxes = _annotate_weapon_boxes(
                    frame, results, weapon_classes
                )
                if frame_max_conf > 0:
                    has_room = len(top_frames) < max_kept
                    if has_room or frame_max_conf > top_frames[0][0]:
                        entry = (frame_max_conf, frame_number, frame.copy(), frame_with_boxes)
                        if has_room:
                            heappush(top_frames, entry)
                        else:
                            # Replace the weakest kept frame
                            heappushpop(top_frames, entry)

            prev_gray = gray
            frame_number += 1
    finally:
        # Release the capture on every exit path, including early returns.
        cap.release()

    # Save the top frames (both original and with boxes), best first
    for confidence, _, original_frame, boxed_frame in sorted(top_frames, reverse=True):
        original_path = os.path.join(output_dir, f"{uuid.uuid4()}.jpg")
        cv2.imwrite(original_path, original_frame)
        saved_paths['original'].append(original_path)

        boxed_path = os.path.join(output_dir, f"{uuid.uuid4()}.jpg")
        cv2.imwrite(boxed_path, boxed_frame)
        saved_paths['boxed'].append(boxed_path)
        print(f"Saved frame pair with confidence {confidence:.3f}")

    return saved_paths


def _annotate_weapon_boxes(frame, results, weapon_classes):
    """Draw detections of the given classes on a copy of ``frame``.

    Returns:
        ``(max_confidence, annotated_frame)``; ``max_confidence`` is 0 when
        none of the detections belongs to ``weapon_classes``.
    """
    annotated = frame.copy()
    max_conf = 0
    for result in results:
        for box in result.boxes:
            class_name = model_yolo.names[int(box.cls[0])]
            if class_name not in weapon_classes:
                continue
            confidence = float(box.conf[0])
            max_conf = max(max_conf, confidence)
            x1, y1, x2, y2 = map(int, box.xyxy[0])
            cv2.rectangle(annotated, (x1, y1), (x2, y2), (0, 255, 0), 2)
            label = f"{class_name} ({confidence:.2f})"
            cv2.putText(annotated, label, (x1, y1 - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)
    return max_conf, annotated
def detect_weapon_image(source_image_path):
    """Run the YOLO model on an image and save the annotated result(s).

    Args:
        source_image_path: Image source handed to ``model_yolo.predict``.

    Returns:
        List of paths of the annotated JPEG files written to ``output_dir``,
        one per prediction result.
    """
    os.makedirs(output_dir, exist_ok=True)
    predictions = model_yolo.predict(source=source_image_path, save=False, show=False)

    def _save_annotated(prediction):
        # Render the detections, then store them under a uuid-based name.
        rendered = prediction.plot()
        target = os.path.join(output_dir, f"{uuid.uuid4()}.jpg")
        cv2.imwrite(target, rendered)
        return target

    return [_save_annotated(prediction) for prediction in predictions]
def response(messages):
    """Generate the vision-language model's reply for a chat message list.

    Args:
        messages: Chat messages passed to ``processor.apply_chat_template``
            and ``process_vision_info``.

    Returns:
        The decoded text of the first (and only) generated sequence, without
        the prompt tokens.
    """
    # Preparation for inference
    prompt = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[prompt],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Follow the model's own placement instead of assuming "cuda", so the
    # inputs land on whatever device the model was loaded onto.
    inputs = inputs.to(model.device)

    # Inference: Generation of the output
    generated_ids = model.generate(**inputs, max_new_tokens=1024)
    # Drop the leading prompt tokens from each generated sequence
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    return output_text[0]
# Instruction text sent as the "system" message with every request.
# Built from adjacent literals; the value starts and ends with a newline.
system_prompt = (
    "\n"
    "Analyze the image for illegal items or contraband. Detect and categorize objects like guns, knives, drugs, and hidden compartments. Highlight areas of interest and provide:\n"
    "1. A detailed explanation in Thai describing illegal items and their context.\n"
    "2. A JSON output summarizing the findings.\n"
    "Output Example:\n"
    "1. Explanation (Thai): (detailed explanation in Thai describing illegal items and their context.)\n"
    '2. JSON: [{"category": "weapon", "type": "gun"}]\n'
)
def is_mp4_file(file_path):
    """Return True when ``file_path`` is an existing file named ``*.mp4``.

    The extension check is case-insensitive. ``None`` or an empty path
    (e.g. no file was uploaded) yields ``False`` instead of raising.

    Args:
        file_path: Path string or path-like object, or ``None``.

    Returns:
        ``True`` only for an existing regular file whose name ends in
        ``.mp4``; ``False`` otherwise.
    """
    if not file_path:
        return False
    path = os.fspath(file_path)
    return os.path.isfile(path) and path.lower().endswith(".mp4")
def process_inputs(text_input, file_input):
    """Analyse a social-media post (text plus optional image/video upload).

    * No upload: the model is queried with the text alone.
    * ``.mp4`` upload: the top weapon frames and the audio transcription are
      extracted and sent to the model together with the text.
    * Any other upload: treated as an image; it is sent to the model and
      additionally run through the YOLO detector.

    Args:
        text_input: Text of the post.
        file_input: Path of the uploaded file, or ``None`` when nothing was
            uploaded.

    Returns:
        A ``(text, images)`` tuple: the model's answer (or ``"Error: ..."``
        when processing failed) and a list of annotated image paths for the
        gallery (empty on error or when there is nothing to show).
    """
    if not file_input:
        # Nothing uploaded: skip the path checks, which would raise on None.
        try:
            return response(_build_messages(text_input, [])), []
        except Exception as e:
            return f"Error: {str(e)}", []

    if is_mp4_file(file_input):
        original_frames = []
        try:
            extract_images_from_video = extract_top_weapon_frames(file_input)
            original_frames = extract_images_from_video['original']
            transcription = extract_audio(file_input)
            # At most the two best frames accompany the text
            messages = _build_messages(text_input, original_frames[:2], transcription)
            # Return response and available boxed images (empty list if none)
            return response(messages), extract_images_from_video.get('boxed', [])
        except Exception as e:
            return f"Error: {str(e)}", []
        finally:
            # The un-annotated frames only feed the model; remove them so
            # repeated requests do not fill the disk. Boxed frames stay,
            # the gallery still has to read them.
            for frame_path in original_frames:
                try:
                    os.remove(frame_path)
                except OSError:
                    pass  # already gone or not removable; nothing to recover

    try:
        result = response(_build_messages(text_input, [file_input]))
        detect_weapon = detect_weapon_image(file_input)
        return result, detect_weapon
    except Exception as e:
        # Handle any exceptions and return the error
        return f"Error: {str(e)}", []


def _build_messages(text_input, image_paths, transcription=None):
    """Assemble the system + user chat messages for ``response``.

    The user message holds one image entry per path, the post text and,
    when ``transcription`` is not ``None``, the video transcription.
    """
    user_content = [
        {"type": "image", "image": f"file://{path}"} for path in image_paths
    ]
    user_content.append(
        {"type": "text", "text": f"Content From Social Media Post: {text_input}."}
    )
    if transcription is not None:
        user_content.append(
            {"type": "text", "text": f"this is transcription from video:{transcription}"}
        )
    return [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_content},
    ]
# Gradio UI: a text box and a file upload go in; the model's answer and a
# gallery of annotated images come out.
_text_field = gr.Textbox(
    label="Text Input",
    placeholder="Enter your text here...",
    lines=3,
)
_upload_field = gr.File(
    label="File Upload",
    file_types=[".mp4", ".png", ".jpeg", ".jpg"],
    type="filepath",
)
_result_box = gr.Textbox(label="Process Results", lines=8)
_result_gallery = gr.Gallery(
    label="Generated images",
    show_label=False,
    elem_id="gallery",
    columns=[2],
    rows=[1],
    object_fit="contain",
    height="auto",
)

demo = gr.Interface(
    fn=process_inputs,
    inputs=[_text_field, _upload_field],
    outputs=[_result_box, _result_gallery],
    title="Text and File Input Processor Qwen2-VL-7B-Instruct",
    description="Enter text and/or upload a file to process them together",
)

if __name__ == "__main__":
    demo.launch()