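"""Gradio app for YOLOv5 object detection on images and videos.

Detected objects are drawn with bounding boxes and counted per class, and the
class labels are translated into Arabic with a Helsinki-NLP OPUS-MT model.
"""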
import cv2
import torch
from PIL import Image, ImageDraw
import gradio as gr
import numpy as np
import pandas as pd
from transformers import pipeline

# Load the pretrained YOLOv5s model from Torch Hub (weights are downloaded on first run)
model = torch.hub.load('ultralytics/yolov5', 'yolov5s')

# Load the translation model
translator = pipeline("translation_en_to_ar", model="Helsinki-NLP/opus-mt-en-ar")

# Define a function to detect objects and draw bounding boxes for images
def detect_and_draw_image(input_image):
    results = model(input_image)
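    # Each row of results.xyxy[0] is [xmin, ymin, xmax, ymax, confidence, class_id]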
    detections = results.xyxy[0].numpy()

    draw = ImageDraw.Draw(input_image)

    counts = {}
    for detection in detections:
        xmin, ymin, xmax, ymax, conf, class_id = detection

        # Update counts for each label
        label = model.names[int(class_id)]
        counts[label] = counts.get(label, 0) + 1

        # Draw the bounding box
        draw.rectangle([(xmin, ymin), (xmax, ymax)], outline="red", width=2)
        draw.text((xmin, ymin), f"{label}: {conf:.2f}", fill="white")

    # Translate the detected labels to Arabic (nothing to translate if no objects were found)
    translated_counts = translator(list(counts.keys())) if counts else []

    df = pd.DataFrame({
        'label (English)': list(counts.keys()),
        'label (Arabic)': [t['translation_text'] for t in translated_counts],
        'counts': list(counts.values())
    })

    return input_image, df

# Define a function to detect objects and draw bounding boxes for videos
def detect_and_draw_video(video_path):
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS) or 20.0  # fall back to 20 fps if the source reports no frame rate
    frames = []
    overall_counts = {}
    detected_objects = set()  # Set to keep track of unique detections

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        frame = cv2.resize(frame, (640, 480))

        # YOLOv5 expects RGB input, while OpenCV decodes frames as BGR
        results = model(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        detections = results.xyxy[0].numpy()

        for detection in detections:
            xmin, ymin, xmax, ymax, conf, class_id = detection

            # Rough de-duplication: key each object by its class and box centre so a
            # stationary object is not re-counted on every frame
            identifier = (model.names[int(class_id)], int((xmin + xmax) / 2), int((ymin + ymax) / 2))

            # Count the object only if it hasn't been detected before
            if identifier not in detected_objects:
                detected_objects.add(identifier)
                label = model.names[int(class_id)]
                overall_counts[label] = overall_counts.get(label, 0) + 1

            cv2.rectangle(frame, (int(xmin), int(ymin)), (int(xmax), int(ymax)), (255, 0, 0), 2)
            cv2.putText(frame, f"{model.names[int(class_id)]}: {conf:.2f}", (int(xmin), int(ymin) - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 255, 255), 2)

        frames.append(frame)  # buffer the annotated frame; the video is written out after the loop

    cap.release()

    if not frames:
        return None, None

    output_path = 'output.mp4'
    out = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*'mp4v'), fps, (640, 480))

    for frame in frames:
        out.write(frame)
    out.release()

    # Translate the detected labels to Arabic (nothing to translate if no objects were found)
    translated_counts = translator(list(overall_counts.keys())) if overall_counts else []

    df = pd.DataFrame({
        'label (English)': list(overall_counts.keys()),
        'label (Arabic)': [t['translation_text'] for t in translated_counts],
        'counts': list(overall_counts.values())
    })

    return output_path, df

# Create separate interfaces for images and videos
image_interface = gr.Interface(
    fn=detect_and_draw_image,
    inputs=gr.Image(type="pil", label="Upload Image"),
    outputs=[gr.Image(type="pil"), gr.Dataframe(label="Object Counts")],
    title="Object Detection for Images",
    description="Upload an image to see the objects detected by YOLOv5 with bounding boxes and their counts."
)

video_interface = gr.Interface(
    fn=detect_and_draw_video,
    inputs=gr.Video(label="Upload Video"),
    outputs=[gr.Video(label="Processed Video"), gr.Dataframe(label="Object Counts")],
    title="Object Detection for Videos",
    description="Upload a video to see the objects detected by YOLOv5 with bounding boxes and their counts."
)

# Combine interfaces into a single app
app = gr.TabbedInterface([image_interface, video_interface], ["Image Detection", "Video Detection"])

# Launch the app
app.launch(debug=True)