File size: 4,574 Bytes
100b9b2
 
109fee8
100b9b2
d48f382
 
100b9b2
 
01e83c8
9c69830
 
100b9b2
d48f382
100b9b2
 
 
 
 
d48f382
100b9b2
 
 
 
d48f382
100b9b2
 
 
d48f382
 
 
100b9b2
d48f382
100b9b2
8a4dc7e
 
 
 
 
 
 
 
 
d48f382
100b9b2
f5586b0
d48f382
47f97bd
d48f382
 
fd362dd
d48f382
 
100b9b2
 
 
 
 
 
fd362dd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47f97bd
fd362dd
 
d48f382
 
100b9b2
 
 
47f97bd
100b9b2
d5b8a83
d48f382
d5b8a83
d48f382
d5b8a83
 
 
47f97bd
d5b8a83
d48f382
d5b8a83
 
 
d48f382
 
d5b8a83
 
 
100b9b2
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import gradio as gr
from ultralytics import YOLOv10
from skimage.metrics import structural_similarity as ssim
import cv2
import torch
import os
import spaces


# Use the GPU when one is visible to torch; all inference below targets this device.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Load the pretrained YOLOv10-X detector from the Hugging Face Hub and move it to `device`.
model = YOLOv10.from_pretrained('jameslahm/yolov10x').to(device)

# Define activity categories based on detected objects.
# Maps an activity label to the YOLO class names that indicate it.
activity_categories = {
    "Working": ["laptop", "computer", "keyboard", "office chair"],
    "Meal Time": ["fork", "spoon", "plate", "food"],
    "Exercise": ["dumbbell", "bicycle", "yoga mat", "treadmill"],
    "Outdoors": ["car", "tree", "bicycle", "road"],
    # Add more categories and objects as needed
}

# Function to map detected objects to categorized activities
def categorize_activity(detected_objects):
    """Group detected object names into activity categories.

    Args:
        detected_objects: iterable of object class-name strings detected
            in a single frame.

    Returns:
        dict mapping each matched activity name to a one-element list
        containing the list of detected objects belonging to that
        activity.  The nested-list shape is kept for backward
        compatibility with callers that index ``objects[0]``.
    """
    detected = set(detected_objects)  # O(1) membership tests
    categorized_activities = {}

    for activity, objects in activity_categories.items():
        # Bug fix: previously the *entire* detected-object list was stored
        # under every matched activity; keep only the objects that actually
        # belong to this activity.
        matched = [obj for obj in objects if obj in detected]
        if matched:
            categorized_activities[activity] = [matched]

    return categorized_activities


# Decide whether two frames differ enough (via SSIM) to be worth re-processing.
def is_frame_different(frame1, frame2, threshold=0.9):
    """Return True when the structural similarity of the two BGR frames
    falls below *threshold*, i.e. the frames are visually different."""
    gray_a, gray_b = (
        cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) for img in (frame1, frame2)
    )
    similarity, _ = ssim(gray_a, gray_b, full=True)
    return similarity < threshold


# Function to process the video, detect objects, and generate a categorized journal with images
@spaces.GPU
def generate_journal_with_images(video_path, frame_interval=30):
    """Run YOLOv10 over sampled frames of a video and build an activity journal.

    Args:
        video_path: path to the input video file.
        frame_interval: process every Nth frame; frames in between are also
            processed when they differ visibly from the last processed one.

    Returns:
        A ``(journal_entries, saved_images)`` tuple where ``journal_entries``
        is a list of human-readable strings and ``saved_images`` is the list
        of annotated frame image paths.

    Bug fix: the original returned only ``journal_entries`` (as
    ``(text, path)`` tuples), while the caller unpacks TWO values and joins
    the first as strings — which crashed at runtime.
    """
    cap = cv2.VideoCapture(video_path)
    journal_entries = []
    saved_images = []
    frame_count = 0
    last_processed_frame = None
    output_folder = "detected_frames"
    os.makedirs(output_folder, exist_ok=True)  # Create folder to store images

    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            # Process every Nth frame, or any frame that is visibly different
            # from the last processed frame.
            if frame_count % frame_interval == 0 or (
                last_processed_frame is not None
                and is_frame_different(last_processed_frame, frame)
            ):
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

                # Make predictions using YOLOv10 on the current frame
                results = model.predict(source=frame_rgb, device=device)

                # Plot bounding boxes and labels on the image
                annotated_frame = results[0].plot()

                # Save the annotated image (convert RGB back to BGR for OpenCV)
                frame_filename = os.path.join(output_folder, f"frame_{frame_count}.jpg")
                cv2.imwrite(frame_filename, annotated_frame[:, :, ::-1])
                saved_images.append(frame_filename)

                # Extract labels (class indices) and map them to class names
                detected_objects = [model.names[int(box.cls)] for box in results[0].boxes]

                # Current timestamp in the video (ms -> seconds)
                timestamp = cap.get(cv2.CAP_PROP_POS_MSEC) / 1000

                # Categorize the detected objects into activities
                activity_summary = categorize_activity(detected_objects)

                # Store one plain-string entry per activity; the activity name
                # is now included (it was computed but unused before).
                for activity, objects in activity_summary.items():
                    journal_entries.append(
                        f"At {timestamp:.2f} seconds: {activity} - {', '.join(objects[0])}"
                    )

                last_processed_frame = frame  # Update the last processed frame

            frame_count += 1
    finally:
        cap.release()  # Always release the capture, even if an error occurs

    return journal_entries, saved_images


def display_journal_with_images(video):
    """Generate the journal for *video* and return (journal text, image paths).

    Expects ``generate_journal_with_images`` to yield a two-tuple of string
    entries and annotated-frame paths; the entries are joined into a single
    newline-separated block for the textbox, the paths feed the gallery.
    """
    entries, image_paths = generate_journal_with_images(video, frame_interval=30)
    return "\n".join(entries), image_paths

# Define Gradio Blocks for custom display
with gr.Blocks() as iface:
    video_input = gr.Video(label="Upload Video", height=300)
    journal_output = gr.Textbox(label="Generated Daily Journal", lines=10)
    # NOTE(review): Gallery.style() was deprecated in Gradio 3.41 and removed
    # in Gradio 4.x; layout options are constructor arguments now. Confirm the
    # pinned Gradio version if this needs to run on an older release.
    image_gallery = gr.Gallery(label="Annotated Frames", columns=2, height="auto")
    run_button = gr.Button("Generate Journal")

    run_button.click(
        fn=display_journal_with_images,
        inputs=video_input,
        outputs=[journal_output, image_gallery],
    )

# Bug fix: launch() was called twice; the duplicate second call is removed.
iface.launch()