Upload 9 files
- .gitattributes +4 -0
- app.py +235 -0
- config.py +25 -0
- examples/buffalo.jpg +3 -0
- examples/elephants.mp4 +3 -0
- examples/rhino.mp4 +3 -0
- examples/zebra.jpg +3 -0
- model.py +33 -0
- outputs/best_model_79.pth +3 -0
- requirements.txt +5 -0
.gitattributes
CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+examples/buffalo.jpg filter=lfs diff=lfs merge=lfs -text
+examples/elephants.mp4 filter=lfs diff=lfs merge=lfs -text
+examples/rhino.mp4 filter=lfs diff=lfs merge=lfs -text
+examples/zebra.jpg filter=lfs diff=lfs merge=lfs -text
app.py
ADDED
@@ -0,0 +1,235 @@
import os
import cv2
import time
import torch
import gradio as gr
import numpy as np

# Local imports from this project.
from model import create_model
from config import NUM_CLASSES, DEVICE, CLASSES

# ----------------------------------------------------------------
# GLOBAL SETUP
# ----------------------------------------------------------------
# Create the model and load the best weights.
model = create_model(num_classes=NUM_CLASSES)
checkpoint = torch.load("outputs/best_model_79.pth", map_location=DEVICE)
model.load_state_dict(checkpoint["model_state_dict"])
model.to(DEVICE).eval()

# One random color per class index
# (length matches len(CLASSES), including the background class).
COLORS = np.random.uniform(0, 255, size=(len(CLASSES), 3))

# COLORS = [
#     (255, 255, 0),    # Cyan - background
#     (50, 0, 255),     # Red - buffalo
#     (147, 20, 255),   # Pink - elephant
#     (0, 255, 0),      # Green - rhino
#     (238, 130, 238),  # Violet - zebra
# ]


# ----------------------------------------------------------------
# HELPER FUNCTIONS
# ----------------------------------------------------------------
def inference_on_image(orig_image: np.ndarray, resize_dim=None, threshold=0.25):
    """
    Runs inference on a single image (OpenCV BGR NumPy array).
    - resize_dim: if not None, the image is resized to (resize_dim, resize_dim) for inference
    - threshold: detection confidence threshold
    Returns: (image with bounding boxes drawn, FPS of the forward pass).
    """
    image = orig_image.copy()
    h_orig, w_orig = orig_image.shape[:2]
    # Optionally resize for inference.
    if resize_dim is not None:
        image = cv2.resize(image, (resize_dim, resize_dim))

    # Convert BGR to RGB and normalize to [0, 1].
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).astype(np.float32) / 255.0
    # Move channels to front (C, H, W) and add a batch dimension.
    image_tensor = torch.tensor(image_rgb.transpose(2, 0, 1), dtype=torch.float).unsqueeze(0).to(DEVICE)
    start_time = time.time()
    # Inference.
    with torch.no_grad():
        outputs = model(image_tensor)
    end_time = time.time()
    # FPS for this single forward pass.
    fps = 1 / (end_time - start_time)
    fps_text = f"FPS: {fps:.2f}"
    # Move outputs to CPU NumPy arrays.
    outputs = [{k: v.cpu() for k, v in t.items()} for t in outputs]
    boxes = outputs[0]["boxes"].numpy()
    scores = outputs[0]["scores"].numpy()
    labels = outputs[0]["labels"].numpy().astype(int)

    # Filter out boxes with low confidence.
    valid_idx = np.where(scores >= threshold)[0]
    boxes = boxes[valid_idx].astype(int)
    labels = labels[valid_idx]

    # If we resized for inference, rescale boxes back to the original image size.
    if resize_dim is not None:
        boxes[:, [0, 2]] = (boxes[:, [0, 2]] / resize_dim) * w_orig
        boxes[:, [1, 3]] = (boxes[:, [1, 3]] / resize_dim) * h_orig

    # Draw bounding boxes and class names on the original image.
    for box, label_idx in zip(boxes, labels):
        class_name = CLASSES[label_idx] if 0 <= label_idx < len(CLASSES) else str(label_idx)
        color = COLORS[label_idx % len(COLORS)][::-1]  # BGR color
        cv2.rectangle(orig_image, (box[0], box[1]), (box[2], box[3]), color, 5)
        cv2.putText(orig_image, class_name, (box[0], box[1] - 5), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (0, 0, 0), 3)
    # Overlay the FPS near the top center of the image.
    cv2.putText(
        orig_image,
        fps_text,
        (int((w_orig / 2) - 50), 30),
        cv2.FONT_HERSHEY_SIMPLEX,
        0.8,
        (0, 255, 0),
        2,
        cv2.LINE_AA,
    )
    return orig_image, fps


def inference_on_frame(frame: np.ndarray, resize_dim=None, threshold=0.25):
    """
    Same as inference_on_image but for a single video frame.
    Returns the processed frame with bounding boxes and the FPS for that frame.
    """
    return inference_on_image(frame, resize_dim, threshold)


# ----------------------------------------------------------------
# GRADIO FUNCTIONS
# ----------------------------------------------------------------


def img_inf(image_path, resize_dim, threshold):
    """
    Gradio function for image inference.
    :param image_path: File path from Gradio (uploaded image).
    :param resize_dim: Dimension the image is resized to before inference.
    :param threshold: Confidence threshold for detections.
    Returns: A NumPy image array (RGB) with bounding boxes.
    """
    if image_path is None:
        return None  # No image provided.
    orig_image = cv2.imread(image_path)  # BGR
    if orig_image is None:
        return None  # Error reading image.

    result_image, _ = inference_on_image(orig_image, resize_dim=resize_dim, threshold=threshold)
    # Return the image in RGB for Gradio's display.
    result_image_rgb = cv2.cvtColor(result_image, cv2.COLOR_BGR2RGB)
    return result_image_rgb


def vid_inf(video_path, resize_dim, threshold):
    """
    Gradio function for video inference.
    Processes each frame, draws bounding boxes, and writes to an output video.
    Yields: (processed_frame, None) for every frame, then (None, output_video_file_path).
    """
    if video_path is None:
        yield None, None  # No video provided.
        return

    # Prepare input capture.
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        yield None, None
        return

    # Create an output file path.
    os.makedirs("inference_outputs/videos", exist_ok=True)
    out_video_path = os.path.join("inference_outputs/videos", "video_output.mp4")
    # out_video_path = "video_output.mp4"

    # Get video properties.
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps = cap.get(cv2.CAP_PROP_FPS)
    fourcc = cv2.VideoWriter_fourcc(*"mp4v")  # or 'XVID'

    # If FPS is reported as 0 (some containers do this), fall back to a default.
    if fps <= 0:
        fps = 20.0

    out_writer = cv2.VideoWriter(out_video_path, fourcc, fps, (width, height))

    frame_count = 0
    total_fps = 0

    while True:
        ret, frame = cap.read()
        if not ret:
            break
        # Inference on the frame.
        processed_frame, frame_fps = inference_on_frame(frame, resize_dim=resize_dim, threshold=threshold)
        total_fps += frame_fps
        frame_count += 1

        # Write the processed frame and stream it to the UI.
        out_writer.write(processed_frame)
        yield cv2.cvtColor(processed_frame, cv2.COLOR_BGR2RGB), None

    cap.release()
    out_writer.release()

    if frame_count > 0:
        avg_fps = total_fps / frame_count
        print(f"Average FPS: {avg_fps:.3f}")
    yield None, out_video_path


# ----------------------------------------------------------------
# BUILD THE GRADIO INTERFACES
# ----------------------------------------------------------------

# Components for the image tab.
resize_dim = gr.Slider(100, 1024, value=640, label="Resize Dimension", info="Resize image to this dimension")
threshold = gr.Slider(0, 1, value=0.5, label="Threshold", info="Confidence threshold for detection")
inputs_image = gr.Image(type="filepath", label="Input Image")
outputs_image = gr.Image(type="numpy", label="Output Image")

interface_image = gr.Interface(
    fn=img_inf,
    inputs=[inputs_image, resize_dim, threshold],
    outputs=outputs_image,
    title="Image Inference",
    description="Upload your photo, adjust the resize dimension and threshold, and see the results!",
    examples=[["examples/buffalo.jpg"], ["examples/zebra.jpg"]],
    cache_examples=False,
)

# Fresh components for the video tab.
resize_dim = gr.Slider(100, 1024, value=640, label="Resize Dimension", info="Resize image to this dimension")
threshold = gr.Slider(0, 1, value=0.5, label="Threshold", info="Confidence threshold for detection")
input_video = gr.Video(label="Input Video")

# Output is a pair: (last processed frame, output video path).
output_frame = gr.Image(type="numpy", label="Output (Last Processed Frame)")
output_video_file = gr.Video(format="mp4", label="Output Video")

interface_video = gr.Interface(
    fn=vid_inf,
    inputs=[input_video, resize_dim, threshold],
    outputs=[output_frame, output_video_file],
    title="Video Inference",
    description="Upload your video and see the processed output!",
    examples=[["examples/elephants.mp4"], ["examples/rhino.mp4"]],
    cache_examples=False,
)

# Combine both interfaces in a tabbed app and launch it.
demo = (
    gr.TabbedInterface(
        [interface_image, interface_video],
        tab_names=["Image", "Video"],
        title="Fine-Tuning RetinaNet for Wildlife Animal Detection",
        theme="gstaff/xkcd",
    )
    .queue()
    .launch()
)
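The helpers above can also be exercised without the Gradio UI. Below is a minimal sketch (not part of the upload) that loads the same checkpoint and prints the detections for one of the bundled example images; it assumes it is run from the repository root and repeats the preprocessing from inference_on_image instead of importing app, since importing app would also build and launch the interface.

# Standalone sketch (assumption: run from the repo root; not part of the upload).
import cv2
import torch
import numpy as np

from model import create_model
from config import NUM_CLASSES, DEVICE, CLASSES

model = create_model(num_classes=NUM_CLASSES)
checkpoint = torch.load("outputs/best_model_79.pth", map_location=DEVICE)
model.load_state_dict(checkpoint["model_state_dict"])
model.to(DEVICE).eval()

image = cv2.imread("examples/buffalo.jpg")  # BGR, as in app.py
rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB).astype(np.float32) / 255.0
tensor = torch.tensor(rgb.transpose(2, 0, 1)).unsqueeze(0).to(DEVICE)

with torch.no_grad():
    out = model(tensor)[0]

# Print class name, score, and box for detections above a 0.5 threshold.
keep = out["scores"] >= 0.5
for box, label, score in zip(out["boxes"][keep], out["labels"][keep], out["scores"][keep]):
    print(CLASSES[int(label)], f"{score:.2f}", [int(v) for v in box])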
config.py
ADDED
@@ -0,0 +1,25 @@
import torch

BATCH_SIZE = 8  # Increase / decrease according to GPU memory.
RESIZE_TO = 640  # Resize the image for training and transforms.
NUM_EPOCHS = 60  # Number of epochs to train for.
NUM_WORKERS = 4  # Number of parallel workers for data loading.

DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Training images and labels files directory.
TRAIN_DIR = "data/train"
# Validation images and labels files directory.
VALID_DIR = "data/valid"

# Classes: index 0 is reserved for the background.
CLASSES = ["__background__", "buffalo", "elephant", "rhino", "zebra"]

NUM_CLASSES = len(CLASSES)

# Whether to visualize images after creating the data loaders.
VISUALIZE_TRANSFORMED_IMAGES = True

# Location to save the model and plots.
OUT_DIR = "outputs"
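Because index 0 of CLASSES is the background, a detector label ID indexes directly into this list. A tiny illustrative sketch (the label IDs here are hypothetical, not part of the upload):

from config import CLASSES, NUM_CLASSES

print(NUM_CLASSES)  # 5: "__background__" plus the four animal classes

label_ids = [1, 4, 2]  # hypothetical detector output
print([CLASSES[i] for i in label_ids])  # ['buffalo', 'zebra', 'elephant']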
examples/buffalo.jpg
ADDED
(Image stored via Git LFS; preview omitted.)
examples/elephants.mp4
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f7cd66a941d8883505826e1b191c6c45f21a2f9cad05301f1ce62da676b431a3
size 3617117
examples/rhino.mp4
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c151ff03aa5a01c4604ccc7cca9dd1518eb8faf807a320001f7efb2598effcef
size 9404729
examples/zebra.jpg
ADDED
(Image stored via Git LFS; preview omitted.)
model.py
ADDED
@@ -0,0 +1,33 @@
import torchvision
import torch

from functools import partial
from torchvision.models.detection import RetinaNet_ResNet50_FPN_V2_Weights
from torchvision.models.detection.retinanet import RetinaNetClassificationHead
from config import NUM_CLASSES


def create_model(num_classes=91):
    """
    Creates a RetinaNet-ResNet50-FPN v2 model pre-trained on COCO.
    Replaces the classification head for the required number of classes.
    """
    model = torchvision.models.detection.retinanet_resnet50_fpn_v2(weights=RetinaNet_ResNet50_FPN_V2_Weights.COCO_V1)
    num_anchors = model.head.classification_head.num_anchors

    # Replace the classification head.
    model.head.classification_head = RetinaNetClassificationHead(
        in_channels=256, num_anchors=num_anchors, num_classes=num_classes, norm_layer=partial(torch.nn.GroupNorm, 32)
    )
    return model


if __name__ == "__main__":
    model = create_model(num_classes=NUM_CLASSES)
    print(model)
    # Total parameters.
    total_params = sum(p.numel() for p in model.parameters())
    print(f"{total_params:,} total parameters.")
    # Trainable parameters.
    total_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"{total_trainable_params:,} training parameters.")
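A quick way to sanity-check the replaced classification head is a dummy forward pass in eval mode: torchvision detection models take a list of image tensors and return one dict per image with boxes, scores, and labels. A minimal sketch (not part of the upload; note that create_model downloads the COCO-pretrained backbone weights on first use):

import torch

from model import create_model
from config import NUM_CLASSES

model = create_model(num_classes=NUM_CLASSES).eval()

# One dummy 3x640x640 image; detection models expect a list of CHW tensors.
dummy = [torch.rand(3, 640, 640)]
with torch.no_grad():
    preds = model(dummy)

print(list(preds[0].keys()))  # ['boxes', 'scores', 'labels']
if preds[0]["labels"].numel() > 0:
    print(int(preds[0]["labels"].max()))  # label IDs stay below NUM_CLASSES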
outputs/best_model_79.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:82a9d5634acb4adeecdf8417cae2d26c6389ca7279472bc65fbc907a28300047
size 146001704
requirements.txt
ADDED
@@ -0,0 +1,5 @@
opencv-python==4.11.0.86
torch==2.6.0
torchvision==0.21.0
torchaudio==2.6.0
gradio==5.18.0