"""Gradio demo: run YOLOS-tiny object detection on every frame of a video."""

import tempfile

import cv2
import gradio as gr
import torch
from PIL import Image
from transformers import YolosForObjectDetection, YolosImageProcessor

# Load model and processor once at import time so every request reuses them.
model = YolosForObjectDetection.from_pretrained('hustvl/yolos-tiny')
image_processor = YolosImageProcessor.from_pretrained("hustvl/yolos-tiny")

# Every frame is downscaled to this (width, height) to reduce processing time.
FRAME_SIZE = (640, 360)
# Minimum confidence score for a detection to be drawn.
DETECTION_THRESHOLD = 0.9
# Frame rate used when the source video does not report a usable one.
FALLBACK_FPS = 20.0
# Colour (BGR) of the boxes and captions.
BOX_COLOR = (0, 255, 0)


def process_frame(frame):
    """Detect objects in one frame and draw labelled boxes on it.

    Args:
        frame: Image array in OpenCV's BGR channel order (as returned by
            ``cv2.VideoCapture.read``).

    Returns:
        A resized copy of the frame (``FRAME_SIZE``) with a rectangle and a
        ``"<label>: <score>"`` caption drawn for every detection whose score
        exceeds ``DETECTION_THRESHOLD``.
    """
    # cv2.resize returns a new array, so the caller's frame is not modified.
    frame = cv2.resize(frame, FRAME_SIZE)

    # The image processor is fed a PIL image in RGB order; OpenCV uses BGR.
    image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    inputs = image_processor(images=image, return_tensors="pt")

    # Inference only: no gradients needed.
    with torch.no_grad():
        outputs = model(**inputs)

    # PIL reports (width, height); reversing it yields (height, width), which
    # is used to scale the boxes back to pixel coordinates.
    target_sizes = torch.tensor([image.size[::-1]])
    results = image_processor.post_process_object_detection(
        outputs, threshold=DETECTION_THRESHOLD, target_sizes=target_sizes
    )[0]

    detections = zip(results["scores"], results["labels"], results["boxes"])
    for score, label, box in detections:
        x1, y1, x2, y2 = (int(round(v)) for v in box.tolist())
        caption = f"{model.config.id2label[label.item()]}: {round(score.item(), 2)}"
        cv2.rectangle(frame, (x1, y1), (x2, y2), BOX_COLOR, 2)
        # Keep the caption baseline inside the frame for boxes that touch the
        # top edge; otherwise the text would be drawn off-screen.
        cv2.putText(
            frame,
            caption,
            (x1, max(y1 - 10, 15)),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.5,
            BOX_COLOR,
            2,
        )

    return frame


def video_object_detection(video):
    """Annotate every frame of a video and write the result to an MP4 file.

    Frames are written to the output as soon as they are processed, so memory
    use does not grow with the length of the video.

    Args:
        video: Path of the input video, or ``None`` when no video is present
            (presumably what Gradio passes when the input is cleared in live
            mode -- TODO confirm against the installed Gradio version).

    Returns:
        Path of the annotated MP4 file, or ``None`` if ``video`` is ``None``.
        Each call writes to a fresh temporary file so overlapping requests
        cannot overwrite each other's output; the files are not deleted here.

    Raises:
        gr.Error: If the video cannot be opened, contains no readable frames,
            or the output file cannot be created.
    """
    if video is None:
        return None

    cap = cv2.VideoCapture(video)
    writer = None
    output_path = None
    try:
        if not cap.isOpened():
            raise gr.Error("Could not open the uploaded video.")

        # Preserve the source playback speed. The comparison is False for 0,
        # negative values and NaN, all of which select the fallback rate.
        source_fps = cap.get(cv2.CAP_PROP_FPS)
        fps = source_fps if source_fps > 0 else FALLBACK_FPS

        while True:
            ret, frame = cap.read()
            if not ret:
                break

            processed_frame = process_frame(frame)

            # Create the writer lazily: the output size is only known once
            # the first frame has been processed, and nothing is written to
            # disk for a video without frames.
            if writer is None:
                height, width = processed_frame.shape[:2]
                with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
                    output_path = tmp.name
                writer = cv2.VideoWriter(
                    output_path,
                    cv2.VideoWriter_fourcc(*'mp4v'),
                    fps,
                    (width, height),
                )
                if not writer.isOpened():
                    raise gr.Error("Could not create the output video file.")

            writer.write(processed_frame)
    finally:
        # Release native handles even when decoding or inference fails.
        cap.release()
        if writer is not None:
            writer.release()

    if output_path is None:
        raise gr.Error("No frames could be read from the uploaded video.")
    return output_path


# Create Gradio interface with live=True
iface = gr.Interface(
    fn=video_object_detection,
    inputs="video",
    outputs="video",
    title="YOLOs-Tiny Video Detection",
    live=True,
)
iface.launch()