qubvel-hf (HF Staff) committed
Commit 3fd48f0 · 1 Parent(s): 8102816
Files changed (1)
  1. app.py +68 -74
app.py CHANGED
@@ -9,6 +9,7 @@ import torch
 
 import spaces
 import gradio as gr
+import numpy as np
 
 from pathlib import Path
 from functools import lru_cache
@@ -69,38 +70,43 @@ logging.basicConfig(
 logger = logging.getLogger(__name__)
 
 
-@lru_cache(maxsize=3)
-def get_model_and_image_processor(checkpoint: str, device: str = "cpu"):
-    model = AutoModelForObjectDetection.from_pretrained(checkpoint, torch_dtype=TORCH_DTYPE).to(device)
-    image_processor = AutoImageProcessor.from_pretrained(checkpoint)
-    return model, image_processor
-
 @spaces.GPU(duration=20)
 def detect_objects(
     checkpoint: str,
-    images: Optional[List[Image.Image]] = None,
+    images: List[np.ndarray],
     confidence_threshold: float = DEFAULT_CONFIDENCE_THRESHOLD,
-    target_sizes: Optional[List[Tuple[int, int]]] = None,
+    target_size: Optional[Tuple[int, int]] = None,
+    batch_size: int = BATCH_SIZE,
 ):
 
     device = "cuda" if torch.cuda.is_available() else "cpu"
-    model, image_processor = get_model_and_image_processor(checkpoint, device=device)
+    model = AutoModelForObjectDetection.from_pretrained(checkpoint, torch_dtype=TORCH_DTYPE).to(device)
+    image_processor = AutoImageProcessor.from_pretrained(checkpoint)
 
-    # preprocess images
-    inputs = image_processor(images=images, return_tensors="pt")
-    inputs = inputs.to(device).to(TORCH_DTYPE)
+    batches = [images[i:i + batch_size] for i in range(0, len(images), batch_size)]
 
-    # forward pass
-    with torch.no_grad():
-        outputs = model(**inputs)
+    results = []
+    for batch in tqdm.tqdm(batches, desc="Processing frames"):
 
-    # postprocess outputs
-    if not target_sizes:
-        target_sizes = [(image.height, image.width) for image in images]
+        # preprocess images
+        inputs = image_processor(images=batch, return_tensors="pt")
+        inputs = inputs.to(device).to(TORCH_DTYPE)
 
-    results = image_processor.post_process_object_detection(
-        outputs, target_sizes=target_sizes, threshold=confidence_threshold
-    )
+        # forward pass
+        with torch.no_grad():
+            outputs = model(**inputs)
+
+        # postprocess outputs
+        if target_size:
+            target_sizes = [target_size] * len(batch)
+        else:
+            target_sizes = [(image.shape[0], image.shape[1]) for image in batch]
+
+        batch_results = image_processor.post_process_object_detection(
+            outputs, target_sizes=target_sizes, threshold=confidence_threshold
+        )
+
+        results.extend(batch_results)
 
     return results, model.config.id2label
 
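With batching now internal to detect_objects, one call can take any number of frames and returns one result dict per frame, in order. A minimal usage sketch against the new signature (the checkpoint name and threshold are illustrative, not from this commit; the result keys follow transformers' post_process_object_detection):

import numpy as np

frames = [np.zeros((480, 640, 3), dtype=np.uint8)]  # one dummy RGB frame, HWC uint8
results, id2label = detect_objects(
    checkpoint="PekingU/rtdetr_r50vd",  # example checkpoint, assumed for illustration
    images=frames,
    confidence_threshold=0.3,
)
print(results[0]["scores"], results[0]["labels"], results[0]["boxes"])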
@@ -120,7 +126,7 @@ def process_image(
 
     results, id2label = detect_objects(
         checkpoint=checkpoint,
-        images=[image],
+        images=[np.array(image)],
         confidence_threshold=confidence_threshold,
     )
     result = results[0]  # first image in batch (we have batch size 1)
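Passing np.array(image) keeps process_image on the new List[np.ndarray] contract: converting a PIL image yields an HWC uint8 array in the same RGB channel order, so image.shape[0] and image.shape[1] in the post-processing branch are height and width, mirroring the old (image.height, image.width). A quick check:

from PIL import Image
import numpy as np

image = Image.new("RGB", (640, 480))  # PIL size is (width, height)
array = np.array(image)
print(array.shape, array.dtype)  # (480, 640, 3) uint8 -> (height, width, channels)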
@@ -150,6 +156,25 @@ def get_target_size(image_height, image_width, max_size: int):
     new_height = int(image_height * max_size / image_width)
     return new_width, new_height
 
+
+def read_video_k_frames(video_path: str, k: int, read_every_i_frame: int = 1):
+    cap = cv2.VideoCapture(video_path)
+    frames = []
+    i = 0
+    progress_bar = tqdm.tqdm(total=k, desc="Reading frames")
+    while cap.isOpened() and len(frames) < k:
+        ret, frame = cap.read()
+        if not ret:
+            break
+        if i % read_every_i_frame == 0:
+            frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+            progress_bar.update(1)
+        i += 1
+    cap.release()
+    progress_bar.close()
+    return frames
+
+
 def process_video(
     video_path: str,
     checkpoint: str,
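read_video_k_frames decodes sequentially and keeps every i-th frame, converted BGR -> RGB, which is how process_video downsamples to roughly 25 fps: a 50 fps source gives read_each_i_frame = 50 // 25 = 2, so every second frame is kept and written back at 25 fps. A usage sketch with a hypothetical path:

frames = read_video_k_frames("input.mp4", k=100, read_every_i_frame=2)  # hypothetical file
print(len(frames))      # at most 100 (fewer if the video is shorter)
print(frames[0].shape)  # (height, width, 3), RGB uint8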
@@ -164,69 +189,38 @@ def process_video(
     if ext not in ALLOWED_VIDEO_EXTENSIONS:
         raise ValueError(f"Unsupported video format: {ext}, supported formats: {ALLOWED_VIDEO_EXTENSIONS}")
 
-    cap = cv2.VideoCapture(video_path)
-    if not cap.isOpened():
-        raise ValueError(f"Failed to open video: {video_path}")
-
-    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
-    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
-    fps = cap.get(cv2.CAP_PROP_FPS)
-    num_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+    video_info = sv.VideoInfo.from_video_path(video_path)
+    read_each_i_frame = video_info.fps // 25
+    target_fps = video_info.fps / read_each_i_frame
+    target_width, target_height = get_target_size(video_info.height, video_info.width, 1080)
 
-    process_each_frame = fps // 25
-    target_fps = fps / process_each_frame
-    target_width, target_height = get_target_size(height, width, 1080)
+    n_frames_to_read = min(MAX_NUM_FRAMES, video_info.total_frames // read_each_i_frame)
+    frames = read_video_k_frames(video_path, n_frames_to_read, read_each_i_frame)
 
     # Use H.264 codec for browser compatibility
-    fourcc = cv2.VideoWriter_fourcc(*"MJPG")
-    temp_file = tempfile.NamedTemporaryFile(suffix=".avi", delete=False)
+    fourcc = cv2.VideoWriter_fourcc(*"H264")
+    temp_file = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
     writer = cv2.VideoWriter(temp_file.name, fourcc, target_fps, (target_width, target_height))
 
     box_annotator = sv.BoxAnnotator(thickness=1)
     label_annotator = sv.LabelAnnotator(text_scale=0.5)
-
-    if not writer.isOpened():
-        cap.release()
-        temp_file.close()
-        os.unlink(temp_file.name)
-        raise ValueError("Failed to initialize video writer")
-
-    frames_to_process = int(min(MAX_NUM_FRAMES * process_each_frame, num_frames))
-    batch = []
-
-    for i in tqdm.tqdm(range(frames_to_process), desc="Processing video"):
-
-        ok, frame = cap.read()
-        if not ok:
-            break
-
-        if not i % process_each_frame == 0:
-            continue
 
-        if len(batch) < BATCH_SIZE:
-            frame = frame[:, :, ::-1].copy()  # BGR to RGB
-            batch.append(frame)
-            continue
-
-        results, id2label = detect_objects(
-            images=[Image.fromarray(frame) for frame in batch],
-            checkpoint=checkpoint,
-            confidence_threshold=confidence_threshold,
-            target_sizes=[(target_height, target_width)] * len(batch),
-        )
-
-        for frame, result in zip(batch, results):
-            frame = cv2.resize(frame, (target_width, target_height), interpolation=cv2.INTER_AREA)
-            detections = sv.Detections.from_transformers(result, id2label=id2label)
-            detections = detections.with_nms(threshold=0.95, class_agnostic=True)
-            annotated_frame = box_annotator.annotate(scene=frame, detections=detections)
-            annotated_frame = label_annotator.annotate(scene=annotated_frame, detections=detections)
-            writer.write(cv2.cvtColor(annotated_frame, cv2.COLOR_RGB2BGR))
+    results, id2label = detect_objects(
+        images=frames,
+        checkpoint=checkpoint,
+        confidence_threshold=confidence_threshold,
+        target_size=(target_height, target_width),
+    )
 
-        batch = []
+    for frame, result in tqdm.tqdm(zip(frames, results), desc="Annotating frames", total=len(frames)):
+        frame = cv2.resize(frame, (target_width, target_height), interpolation=cv2.INTER_AREA)
+        detections = sv.Detections.from_transformers(result, id2label=id2label)
+        detections = detections.with_nms(threshold=0.95, class_agnostic=True)
+        annotated_frame = box_annotator.annotate(scene=frame, detections=detections)
+        annotated_frame = label_annotator.annotate(scene=annotated_frame, detections=detections)
+        writer.write(cv2.cvtColor(annotated_frame, cv2.COLOR_RGB2BGR))
 
     writer.release()
-    cap.release()
    temp_file.close()
 
     # Copy to persistent directory for Gradio access
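One caveat on the codec switch: cv2.VideoWriter_fourcc(*"H264") depends on the local OpenCV build, and stock pip opencv-python wheels often ship without an H.264 encoder, in which case the writer fails to open. A defensive sketch, reusing target_fps/target_width/target_height from above and falling back through codec candidates:

import cv2
import tempfile

temp_file = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
for codec in ("H264", "avc1", "mp4v"):  # try H.264 variants first, then MPEG-4
    writer = cv2.VideoWriter(
        temp_file.name, cv2.VideoWriter_fourcc(*codec), target_fps, (target_width, target_height)
    )
    if writer.isOpened():
        break
else:
    raise ValueError("Failed to initialize video writer with any known codec")

Relatedly, read_each_i_frame = video_info.fps // 25 evaluates to 0 for sources under 25 fps; guarding with max(1, video_info.fps // 25) would avoid the division by zero in target_fps.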