import gradio as gr
from typing import Dict, List, Any, Optional, Tuple
import matplotlib.pyplot as plt
# Guard project imports so the fallback branches below can actually run;
# a bare top-level import would fail before any method-level try/except.
try:
    from detection_model import DetectionModel
except ImportError:
    DetectionModel = None
try:
    from style import Style
except ImportError:
    Style = None

class UIManager:
"""
    Manages all UI-related functionality.
Handles Gradio interface creation, component definitions, and event binding.
"""
def __init__(self):
"""Initialize the UI Manager."""
self.available_models = None
self.model_choices = []
self.class_choices_formatted = []
self._setup_model_choices()
def _setup_model_choices(self):
"""Setup model choices for dropdowns."""
try:
self.available_models = DetectionModel.get_available_models()
self.model_choices = [model["model_file"] for model in self.available_models]
        except (ImportError, AttributeError):
# Fallback model choices if DetectionModel is not available
self.model_choices = ["yolov8n.pt", "yolov8s.pt", "yolov8m.pt", "yolov8l.pt", "yolov8x.pt"]
# Setup class choices
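        # (entries look like "0: person", "1: bicycle", ... so dropdowns show IDs alongside names)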
self.class_choices_formatted = [f"{id}: {name}" for id, name in self.get_all_classes()]
def get_all_classes(self):
"""
Gets all available COCO classes.
Returns:
List[Tuple[int, str]]: List of (class_id, class_name) tuples
"""
# Try to get from a loaded model first
try:
# This will be injected by the main app when processors are available
if hasattr(self, '_image_processor') and self._image_processor and self._image_processor.model_instances:
for model_instance in self._image_processor.model_instances.values():
if model_instance and model_instance.is_model_loaded:
try:
# Ensure class_names is a dict {id: name}
if isinstance(model_instance.class_names, dict):
return sorted([(int(idx), name) for idx, name in model_instance.class_names.items()])
except Exception as e:
print(f"Error getting class names from model: {e}")
except Exception:
pass
# COCO Classes
default_classes = {
0: 'person', 1: 'bicycle', 2: 'car', 3: 'motorcycle', 4: 'airplane', 5: 'bus',
6: 'train', 7: 'truck', 8: 'boat', 9: 'traffic light', 10: 'fire hydrant',
11: 'stop sign', 12: 'parking meter', 13: 'bench', 14: 'bird', 15: 'cat',
16: 'dog', 17: 'horse', 18: 'sheep', 19: 'cow', 20: 'elephant', 21: 'bear',
22: 'zebra', 23: 'giraffe', 24: 'backpack', 25: 'umbrella', 26: 'handbag',
27: 'tie', 28: 'suitcase', 29: 'frisbee', 30: 'skis', 31: 'snowboard',
32: 'sports ball', 33: 'kite', 34: 'baseball bat', 35: 'baseball glove',
36: 'skateboard', 37: 'surfboard', 38: 'tennis racket', 39: 'bottle',
40: 'wine glass', 41: 'cup', 42: 'fork', 43: 'knife', 44: 'spoon', 45: 'bowl',
46: 'banana', 47: 'apple', 48: 'sandwich', 49: 'orange', 50: 'broccoli',
51: 'carrot', 52: 'hot dog', 53: 'pizza', 54: 'donut', 55: 'cake', 56: 'chair',
57: 'couch', 58: 'potted plant', 59: 'bed', 60: 'dining table', 61: 'toilet',
62: 'tv', 63: 'laptop', 64: 'mouse', 65: 'remote', 66: 'keyboard',
67: 'cell phone', 68: 'microwave', 69: 'oven', 70: 'toaster', 71: 'sink',
72: 'refrigerator', 73: 'book', 74: 'clock', 75: 'vase', 76: 'scissors',
77: 'teddy bear', 78: 'hair drier', 79: 'toothbrush'
}
return sorted(default_classes.items())
def set_image_processor(self, image_processor):
"""
Set the image processor reference for dynamic class retrieval.
Args:
image_processor: The ImageProcessor instance
"""
self._image_processor = image_processor
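        # get_all_classes() will then read class names from whichever model instances
        # the processor has already loaded, instead of falling back to the COCO defaults.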
def get_css_styles(self):
"""
Get CSS styles for the interface.
Returns:
str: CSS styles
"""
try:
return Style.get_css()
        except (ImportError, AttributeError):
            # Fallback default CSS style
return """
.app-header {
text-align: center;
padding: 2rem 0 3rem 0;
background: linear-gradient(135deg, #f0f9ff, #e1f5fe);
}
.section-heading {
font-size: 1.2rem;
font-weight: bold;
color: #2D3748;
margin: 1rem 0 0.5rem 0;
}
.detect-btn {
background: linear-gradient(90deg, #38b2ac, #4299e1) !important;
color: white !important;
border: none !important;
border-radius: 8px !important;
}
.video-summary-content-wrapper {
max-height: 400px;
overflow-y: auto;
background-color: #f8f9fa;
border-radius: 8px;
padding: 15px;
border: 1px solid #e2e8f0;
}
"""
def get_model_description(self, model_name):
"""
Get model description for the given model name.
Args:
model_name: Name of the model
Returns:
str: Model description
"""
try:
return DetectionModel.get_model_description(model_name)
        except (ImportError, AttributeError):
return f"Model: {model_name}"
def create_header(self):
"""
Create the application header.
Returns:
gr.HTML: Header HTML component
"""
        return gr.HTML("""
            <h1>VisionScout</h1>
            <p>Object Detection and Scene Understanding</p>
            <p>🖼️ Image Analysis</p>
            <p>🎬 Video Analysis with Temporal Tracking</p>
            <p>📱 iPhone users: HEIC images may not be supported.
            Convert HEIC to JPG before uploading if needed.</p>
        """)
def create_footer(self):
"""
Create the application footer.
Returns:
gr.HTML: Footer HTML component
"""
return gr.HTML("""
""")
def create_image_tab(self):
"""
Create the image processing tab with all components.
Returns:
Dict: Dictionary containing all image tab components
"""
components = {}
with gr.Tab("Image Processing"):
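            # Hidden state that tracks the currently selected model name across callbacks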
components['current_image_model'] = gr.State("yolov8m.pt")
with gr.Row(equal_height=False):
# Left Column: Image Input & Controls
with gr.Column(scale=4, elem_classes="input-panel"):
with gr.Group():
                        gr.HTML('<div class="section-heading">Upload Image</div>')
components['image_input'] = gr.Image(
type="pil",
label="Upload an image",
elem_classes="upload-box"
)
with gr.Accordion("Image Analysis Settings", open=False):
components['image_model_dropdown'] = gr.Dropdown(
choices=self.model_choices,
value="yolov8m.pt",
label="Select Model",
info="Choose speed vs. accuracy (n=fast, m=balanced, x=accurate)"
)
components['image_model_info'] = gr.Markdown(
self.get_model_description("yolov8m.pt")
)
components['image_confidence'] = gr.Slider(
minimum=0.1, maximum=0.9, value=0.25, step=0.05,
label="Confidence Threshold",
info="Minimum confidence for displaying a detected object"
)
components['use_llm'] = gr.Checkbox(
label="Use LLM for enhanced scene descriptions",
value=True,
info="Provides more detailed and natural language descriptions (may increase processing time)"
)
components['use_landmark_detection'] = gr.Checkbox(
label="Use CLIP for Landmark Detection",
value=False,
info="Detect famous landmarks, monuments, and tourist attractions that standard object detection cannot recognize (increases processing time)"
)
with gr.Accordion("Filter Classes", open=False):
                        gr.HTML('<div class="section-heading">Common Categories</div>')
with gr.Row():
components['people_btn'] = gr.Button("People", size="sm")
components['vehicles_btn'] = gr.Button("Vehicles", size="sm")
components['animals_btn'] = gr.Button("Animals", size="sm")
components['objects_btn'] = gr.Button("Common Objects", size="sm")
components['image_class_filter'] = gr.Dropdown(
choices=self.class_choices_formatted,
multiselect=True,
label="Select Classes to Display",
info="Leave empty to show all detected objects"
)
components['image_detect_btn'] = gr.Button(
"Analyze Image",
variant="primary",
elem_classes="detect-btn"
)
# How to use section
with gr.Group(elem_classes="how-to-use"):
                        gr.HTML('<div class="section-heading">How to Use (Image)</div>')
gr.Markdown("""
1. Upload an image or use the camera
                        2. *(Optional)* Adjust settings like confidence threshold or model size (n = fast, m = balanced, x = accurate)
3. In **Analysis Settings**, you can:
* Uncheck **Use LLM** to skip enhanced descriptions (faster)
* Check **Use CLIP for Landmark Detection** to identify famous landmarks like museums, monuments, and tourist attractions *(may take longer)*
* Filter object classes to focus on specific types of objects *(optional)*
4. Click **Analyze Image** button
                        **💡 Tip:** For landmark recognition (e.g. Louvre Museum), make sure to enable **CLIP for Landmark Detection** in the settings above.
""")
# Image Examples
gr.Examples(
examples=[
"room_05.jpg",
"street_04.jpg",
"street_05.jpg",
"landmark_Louvre_01.jpg"
],
inputs=components['image_input'],
label="Example Images"
)
gr.HTML("""
                        📷 Sample images sourced from Unsplash
""")
# Right Column: Image Results
with gr.Column(scale=6, elem_classes="output-panel"):
with gr.Tabs(elem_classes="tabs"):
# Detection Result Tab
with gr.Tab("Detection Result"):
components['image_result_image'] = gr.Image(
type="pil",
label="Detection Result"
)
                            gr.HTML('<div class="section-heading">Detection Details</div>')
components['image_result_text'] = gr.Textbox(
label=None,
lines=10,
elem_id="detection-details",
container=False
)
# Scene Understanding Tab
with gr.Tab("Scene Understanding"):
                            gr.HTML('<div class="section-heading">Scene Analysis</div>')
# Info details
                            gr.HTML("""
                                <details>
                                    <summary>🔍 The AI Vision Scout Report: Click for important notes about this analysis</summary>
                                    <p>About this analysis: This analysis is the model's best guess based on visible objects.
                                    Like human scouts, it sometimes gets lost or sees things that aren't there (but don't we all?).
                                    Consider this an educated opinion rather than absolute truth. For critical applications, always verify with human eyes! 🧐</p>
                                </details>
                            """)
gr.HTML('''
Note: AI descriptions may vary slightly with each generation, reflecting the creative nature of AI. This is similar to how a person might use different words each time they describe the same image. Processing time may be longer during first use or when analyzing complex scenes, as the LLM enhancement requires additional computational resources.
''')
components['image_scene_description_html'] = gr.HTML(
label=None,
elem_id="scene_analysis_description_text"
)
# Original Scene Analysis accordion
with gr.Accordion("Original Scene Analysis", open=False, elem_id="original_scene_analysis_accordion"):
components['image_llm_description'] = gr.HTML(
label=None,
elem_id="original_scene_description_text"
)
with gr.Row():
with gr.Column(scale=1):
                                    gr.HTML('<div class="section-heading">Possible Activities</div>')
components['image_activities_list'] = gr.Dataframe(
headers=["Activity"],
datatype=["str"],
row_count=5,
col_count=1,
wrap=True
)
with gr.Column(scale=1):
                                    gr.HTML('<div class="section-heading">Safety Concerns</div>')
components['image_safety_list'] = gr.Dataframe(
headers=["Concern"],
datatype=["str"],
row_count=5,
col_count=1,
wrap=True
)
                            gr.HTML('<div class="section-heading">Functional Zones</div>')
components['image_zones_json'] = gr.JSON(
label=None,
elem_classes="json-box"
)
                            gr.HTML('<div class="section-heading">Lighting Conditions</div>')
components['image_lighting_info'] = gr.JSON(
label=None,
elem_classes="json-box"
)
# Statistics Tab
with gr.Tab("Statistics"):
with gr.Row():
with gr.Column(scale=3, elem_classes="plot-column"):
                                    gr.HTML('<div class="section-heading">Object Distribution</div>')
components['image_plot_output'] = gr.Plot(
label=None,
elem_classes="large-plot-container"
)
with gr.Column(scale=2, elem_classes="stats-column"):
                                    gr.HTML('<div class="section-heading">Detection Statistics</div>')
components['image_stats_json'] = gr.JSON(
label=None,
elem_classes="enhanced-json-display"
)
return components
def create_video_tab(self):
"""
Create the video processing tab with all components.
        Note: the complex temporal-analysis controls were removed;
        the tab is simplified to basic statistical summaries.
Returns:
Dict: Dictionary containing all video tab components
"""
components = {}
with gr.Tab("Video Processing"):
with gr.Row(equal_height=False):
# Left Column: Video Input & Controls
with gr.Column(scale=4, elem_classes="input-panel"):
with gr.Group():
                        gr.HTML('<div class="section-heading">Video Input</div>')
# Input type selection
components['video_input_type'] = gr.Radio(
["upload", "url"],
label="Input Method",
value="upload",
info="Choose how to provide the video"
)
# File upload
with gr.Group(elem_id="upload-video-group"):
components['video_input'] = gr.Video(
label="Upload a video file (MP4, AVI, MOV)",
sources=["upload"],
visible=True
)
# URL input
with gr.Group(elem_id="url-video-group"):
components['video_url_input'] = gr.Textbox(
label="Enter video URL (YouTube or direct video link)",
placeholder="https://www.youtube.com/watch?v=...",
visible=False,
elem_classes="custom-video-url-input"
)
gr.HTML("""
Note: Currently only YouTube URLs are supported. Maximum video duration is 10 minutes. Due to YouTube's anti-bot protection, some videos may not be downloadable. For protected videos, please upload a local video file instead.
""")
with gr.Accordion("Video Analysis Settings", open=True):
components['video_model_dropdown'] = gr.Dropdown(
choices=self.model_choices,
value="yolov8n.pt",
label="Select Model (Video)",
info="Faster models (like 'n') are recommended for video processing"
)
components['video_confidence'] = gr.Slider(
minimum=0.1, maximum=0.9, value=0.4, step=0.05,
label="Confidence Threshold (Video)",
info="Higher threshold reduces false detections"
)
components['video_process_interval'] = gr.Slider(
minimum=1, maximum=60, value=10, step=1,
label="Processing Interval (Frames)",
info="Analyze every Nth frame (higher value = faster processing)"
)
                        # Simplified analysis description
                        gr.HTML("""
                            Analysis Features:
                            • Accurate object counting with duplicate detection removal
                            • Timeline analysis showing when objects first appear
                            • Duration tracking for object presence in video
                            • Simple, clear statistical summaries
                        """)
components['video_process_btn'] = gr.Button(
"Analyze Video",
variant="primary",
elem_classes="detect-btn"
)
# How to use section
with gr.Group(elem_classes="how-to-use"):
                        gr.HTML('<div class="section-heading">How to Use (Video)</div>')
gr.Markdown("""
1. Choose your input method: Upload a file or enter a URL.
2. Adjust settings if needed:
* Use **faster models** (yolov8n) for quicker processing
* Set **larger intervals** (15+ frames) for longer videos
* Adjust **confidence threshold** to filter low-quality detections
3. Click "Analyze Video". **Processing time varies based on video length.**
4. Review the results: annotated video and statistical analysis.
                        **⚡ Performance Tips:**
                        * For videos longer than 2 minutes, use an interval ≥ 15 frames
                        * The YOLOv8n model provides the best speed for video processing
                        * Higher confidence thresholds reduce processing noise
""")
# Video examples
                    gr.HTML('<div class="section-heading">Example Videos</div>')
gr.HTML("""
""")
# Right Column: Video Results
with gr.Column(scale=6, elem_classes="output-panel video-result-panel"):
gr.HTML("""
Video Analysis Results
                        🎬 Simplified Video Analysis Features
Focus on practical insights: This analysis provides accurate object counts and timing information
without complex tracking. The system uses spatial clustering to eliminate duplicate detections and
provides clear timeline data showing when objects first appear and how long they remain visible.
Key benefits: Reliable object counting, clear timeline analysis, and easy-to-understand results
that directly answer questions like "How many cars are in this video?" and "When do they appear?"
""")
components['video_output'] = gr.Video(
label="Analyzed Video with Object Detection",
elem_classes="video-output-container"
)
with gr.Tabs(elem_classes="video-results-tabs"):
# Analysis Summary Tab
with gr.Tab("Analysis Summary"):
                            gr.HTML('<div class="section-heading">Video Analysis Report</div>')
gr.HTML("""
This summary provides object counts, timeline information, and insights about what appears in your video.
Results are based on spatial clustering analysis to ensure accurate counting.
""")
components['video_summary_text'] = gr.HTML(
label=None,
elem_id="video-summary-html-output"
)
# Detailed Statistics Tab
with gr.Tab("Detailed Statistics"):
                            gr.HTML('<div class="section-heading">Complete Analysis Data</div>')
with gr.Accordion("Processing Information", open=True):
gr.HTML("""
Basic information about video processing parameters and performance.
""")
components['video_stats_json'] = gr.JSON(
label=None,
elem_classes="video-stats-display"
)
with gr.Accordion("Object Details", open=False):
gr.HTML("""
Detailed breakdown of each object type detected, including timing and confidence information.
""")
components['video_object_details'] = gr.JSON(
label="Object-by-Object Analysis",
elem_classes="object-details-display"
)
return components
def get_filter_button_mappings(self):
"""
Get the class ID mappings for filter buttons.
Returns:
Dict: Dictionary containing class ID lists for different categories
"""
available_classes_list = self.get_all_classes()
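        # COCO ID groups behind each quick-filter button (names per get_all_classes above):
        #   0     -> person
        #   1-8   -> vehicles (bicycle, car, motorcycle, airplane, bus, train, truck, boat)
        #   14-23 -> animals (bird, cat, dog, horse, sheep, cow, elephant, bear, zebra, giraffe)
        #   picked IDs -> common objects (bottle, cup, fork, knife, spoon, bowl, chair,
        #                 couch, dining table, tv, laptop, cell phone, book)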
return {
'people_classes_ids': [0],
'vehicles_classes_ids': [1, 2, 3, 4, 5, 6, 7, 8],
'animals_classes_ids': list(range(14, 24)),
'common_objects_ids': [39, 41, 42, 43, 44, 45, 56, 57, 60, 62, 63, 67, 73],
'available_classes_list': available_classes_list
}
def create_interface(self,
handle_image_upload_fn,
handle_video_upload_fn,
download_video_from_url_fn):
"""
Create the complete Gradio interface.
Args:
handle_image_upload_fn: Function to handle image upload
handle_video_upload_fn: Function to handle video upload
download_video_from_url_fn: Function to download video from URL
Returns:
gr.Blocks: Complete Gradio interface
"""
css = self.get_css_styles()
with gr.Blocks(css=css, theme=gr.themes.Soft(primary_hue="teal", secondary_hue="blue")) as demo:
# Header
with gr.Group(elem_classes="app-header"):
self.create_header()
# Main Content with Tabs
with gr.Tabs(elem_classes="tabs"):
# Image Processing Tab
image_components = self.create_image_tab()
# Video Processing Tab
video_components = self.create_video_tab()
# Footer
self.create_footer()
# Setup Event Listeners
self._setup_event_listeners(
image_components,
video_components,
handle_image_upload_fn,
handle_video_upload_fn
)
return demo
def _setup_event_listeners(self,
image_components,
video_components,
handle_image_upload_fn,
handle_video_upload_fn):
"""
Setup all event listeners for the interface.
Args:
image_components: Dictionary of image tab components
video_components: Dictionary of video tab components
handle_image_upload_fn: Function to handle image upload
handle_video_upload_fn: Function to handle video upload
"""
# Image Model Change Handler
image_components['image_model_dropdown'].change(
fn=lambda model: (model, self.get_model_description(model)),
inputs=[image_components['image_model_dropdown']],
outputs=[image_components['current_image_model'], image_components['image_model_info']]
)
# Image Filter Buttons
filter_mappings = self.get_filter_button_mappings()
available_classes_list = filter_mappings['available_classes_list']
people_classes_ids = filter_mappings['people_classes_ids']
vehicles_classes_ids = filter_mappings['vehicles_classes_ids']
animals_classes_ids = filter_mappings['animals_classes_ids']
common_objects_ids = filter_mappings['common_objects_ids']
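        # Each quick-filter button overwrites the class-filter dropdown with the
        # ID-prefixed labels ("id: name") for its category.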
image_components['people_btn'].click(
lambda: [f"{id}: {name}" for id, name in available_classes_list if id in people_classes_ids],
outputs=image_components['image_class_filter']
)
image_components['vehicles_btn'].click(
lambda: [f"{id}: {name}" for id, name in available_classes_list if id in vehicles_classes_ids],
outputs=image_components['image_class_filter']
)
image_components['animals_btn'].click(
lambda: [f"{id}: {name}" for id, name in available_classes_list if id in animals_classes_ids],
outputs=image_components['image_class_filter']
)
image_components['objects_btn'].click(
lambda: [f"{id}: {name}" for id, name in available_classes_list if id in common_objects_ids],
outputs=image_components['image_class_filter']
)
# Video Input Type Change Handler
video_components['video_input_type'].change(
fn=lambda input_type: [
# Show/hide file upload
gr.update(visible=(input_type == "upload")),
# Show/hide URL input
gr.update(visible=(input_type == "url"))
],
inputs=[video_components['video_input_type']],
outputs=[video_components['video_input'], video_components['video_url_input']]
)
# Image Detect Button Click Handler
image_components['image_detect_btn'].click(
fn=handle_image_upload_fn,
inputs=[
image_components['image_input'],
image_components['image_model_dropdown'],
image_components['image_confidence'],
image_components['image_class_filter'],
image_components['use_llm'],
image_components['use_landmark_detection']
],
outputs=[
image_components['image_result_image'],
image_components['image_result_text'],
image_components['image_stats_json'],
image_components['image_plot_output'],
image_components['image_scene_description_html'],
image_components['image_llm_description'],
image_components['image_activities_list'],
image_components['image_safety_list'],
image_components['image_zones_json'],
image_components['image_lighting_info']
]
)
# Video Process Button Click Handler
video_components['video_process_btn'].click(
fn=handle_video_upload_fn,
inputs=[
video_components['video_input'],
video_components['video_url_input'],
video_components['video_input_type'],
video_components['video_model_dropdown'],
video_components['video_confidence'],
video_components['video_process_interval']
],
outputs=[
video_components['video_output'],
video_components['video_summary_text'],
video_components['video_stats_json'],
video_components['video_object_details']
]
)
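

# --- Usage sketch (illustrative; not part of the original app wiring) ---
# A minimal example of how a host application might hand its processing
# callbacks to UIManager. The stub handlers below are hypothetical placeholders;
# they only demonstrate the signatures that _setup_event_listeners binds:
# 6 inputs / 10 outputs for the image handler, 6 inputs / 4 outputs for video.
if __name__ == "__main__":
    def _stub_image_handler(image, model_name, confidence, class_filter, use_llm, use_landmarks):
        # A real handler would run detection and fill all 10 image outputs.
        return None, "", {}, None, "", "", [], [], {}, {}

    def _stub_video_handler(video_file, video_url, input_type, model_name, confidence, interval):
        # A real handler would process the video and fill all 4 video outputs.
        return None, "", {}, {}

    def _stub_download_video(url):
        # A real implementation would download the video and return a local path.
        return None

    ui = UIManager()
    demo = ui.create_interface(
        handle_image_upload_fn=_stub_image_handler,
        handle_video_upload_fn=_stub_video_handler,
        download_video_from_url_fn=_stub_download_video,
    )
    demo.launch()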