import gradio as gr
from ultralytics import YOLOv10
from skimage.metrics import structural_similarity as ssim
import cv2
import torch
import os
import spaces
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = YOLOv10.from_pretrained('jameslahm/yolov10x').to(device)
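# Note: YOLOv10.from_pretrained pulls weights from the Hugging Face Hub; to my
# knowledge it is provided by the THU-MIG YOLOv10 fork of ultralytics rather
# than the stock ultralytics package.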
# Define activity categories based on detected objects
activity_categories = {
    "Working": ["laptop", "computer", "keyboard", "office chair"],
    "Meal Time": ["fork", "spoon", "plate", "food"],
    "Exercise": ["dumbbell", "bicycle", "yoga mat", "treadmill"],
    "Outdoors": ["car", "tree", "bicycle", "road", "subway", "metro"],
    # Add more categories and objects as needed
}
# Function to map detected objects to categorized activities
def categorize_activity(detected_objects):
    categorized_activities = {}
    for activity, objects in activity_categories.items():
        # Collect only the detected objects that belong to this category
        matched = [obj for obj in detected_objects if obj in objects]
        if matched:
            categorized_activities.setdefault(activity, []).extend(matched)
    return categorized_activities
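# Example (illustrative):
#   categorize_activity(["laptop", "fork", "dog"])
#   -> {"Working": ["laptop"], "Meal Time": ["fork"]}  # "dog" matches no category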
# Function to compare frames using SSIM to avoid repeated frames
def is_frame_different(frame1, frame2, threshold=0.9):
    gray_frame1 = cv2.cvtColor(frame1, cv2.COLOR_BGR2GRAY)
    gray_frame2 = cv2.cvtColor(frame2, cv2.COLOR_BGR2GRAY)
    score, _ = ssim(gray_frame1, gray_frame2, full=True)
    return score < threshold
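# NOTE: is_frame_different is defined but not wired into the loop below. A
# minimal way to use it (assuming a hypothetical prev_frame variable carried
# across iterations) would be:
#
#   if prev_frame is not None and not is_frame_different(prev_frame, frame):
#       continue  # skip near-duplicate frames
#   prev_frame = frame.copy()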
# Function to process the video, detect objects, and generate a categorized journal with images
@spaces.GPU
def generate_journal_with_images(video_path, frame_interval=30, confidence_threshold=0.8):
    # frame_interval is currently unused; frames are sampled once per second below
    cap = cv2.VideoCapture(video_path)
    journal_entries = []
    image_paths = []
    frame_count = 0
    output_folder = "detected_frames"
    os.makedirs(output_folder, exist_ok=True)  # Create folder to store images
    last_processed_second = -1  # Keep track of the last processed second

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        # Get the current timestamp in the video
        current_time = cap.get(cv2.CAP_PROP_POS_MSEC) / 1000  # Convert ms to seconds
        current_second = int(current_time)  # Round down to the nearest second
        # Process only one frame per second
        if current_second > last_processed_second:
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            # Make predictions using YOLOv10 on the current frame
            results = model.predict(source=frame_rgb, device=device)
            # Filter detected objects based on the confidence threshold
            detected_objects = []
            for box in results[0].boxes:
                if box.conf >= confidence_threshold:  # Only include objects at or above the threshold
                    detected_objects.append(model.names[int(box.cls)])
            # Only process frames where high-confidence objects are detected
            if detected_objects:
                # Plot bounding boxes and labels on the image
                annotated_frame = results[0].plot()  # Plot detection results on the frame
                # Save the annotated image
                frame_filename = os.path.join(output_folder, f"frame_{frame_count}.jpg")
                cv2.imwrite(frame_filename, annotated_frame[:, :, ::-1])  # Convert back to BGR for saving
                image_paths.append(frame_filename)
                # Categorize the detected objects into activities
                activity_summary = categorize_activity(detected_objects)
                # Store each activity with its timestamp
                for activity, objects in activity_summary.items():
                    journal_entries.append(f"At {current_time:.2f} seconds ({activity}): {', '.join(objects)}")
            last_processed_second = current_second  # Update the last processed second
        frame_count += 1

    cap.release()
    return journal_entries, image_paths
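# Example (hypothetical path), bypassing the UI:
#   entries, images = generate_journal_with_images("day.mp4", confidence_threshold=0.8)
#   print("\n".join(entries))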
# Gradio callback: run the pipeline and format its output for display
def display_journal_with_images(video):
    journal_entries, image_paths = generate_journal_with_images(video, frame_interval=30)
    journal_text = "\n".join(journal_entries)
    return journal_text, image_paths
with gr.Blocks() as iface:
    video_input = gr.Video(label="Upload Video", height=300)
    journal_output = gr.Textbox(label="Generated Daily Journal", lines=10)
    image_gallery = gr.Gallery(label="Annotated Frames")
    run_button = gr.Button("Generate Journal")
    run_button.click(fn=display_journal_with_images, inputs=video_input, outputs=[journal_output, image_gallery])

iface.launch()