File size: 4,837 Bytes
100b9b2
 
109fee8
100b9b2
d48f382
 
100b9b2
 
01e83c8
9c69830
 
100b9b2
d48f382
100b9b2
 
 
 
a9777c0
d48f382
100b9b2
 
 
 
d48f382
100b9b2
 
 
d48f382
 
 
100b9b2
d48f382
100b9b2
8a4dc7e
 
 
 
 
 
 
 
 
d48f382
100b9b2
15b92fc
d48f382
47f97bd
84def21
d48f382
 
 
6437c14
 
100b9b2
 
 
 
 
 
6437c14
 
 
 
 
 
fd362dd
 
 
 
 
a9777c0
 
 
 
 
fd362dd
a9777c0
 
6fb0ffb
 
 
 
 
 
 
 
 
 
 
 
 
 
6437c14
fd362dd
6437c14
d48f382
 
100b9b2
 
 
6437c14
100b9b2
d5b8a83
d48f382
d5b8a83
d48f382
84def21
d5b8a83
 
47f97bd
84def21
d48f382
d5b8a83
 
da6e971
d48f382
 
d5b8a83
 
 
100b9b2
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import gradio as gr
from ultralytics import YOLOv10
from skimage.metrics import structural_similarity as ssim
import cv2
import torch
import os
import spaces


# Run inference on GPU when one is visible to torch, otherwise fall back to CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Pretrained YOLOv10-X detector pulled from the Hugging Face hub and moved to `device`.
model = YOLOv10.from_pretrained('jameslahm/yolov10x').to(device)

# Define activity categories based on detected objects
activity_categories = {
    "Working": ["laptop", "computer", "keyboard", "office chair"],
    "Meal Time": ["fork", "spoon", "plate", "food"],
    "Exercise": ["dumbbell", "bicycle", "yoga mat", "treadmill"],
    "Outdoors": ["car", "tree", "bicycle", "road","subway","metro"],
    # Add more categories and objects as needed
}

# Function to map detected objects to categorized activities
def categorize_activity(detected_objects):
    """Map detected object labels to the activity categories they indicate.

    Parameters
    ----------
    detected_objects : list[str]
        Object class names detected in a single frame.

    Returns
    -------
    dict[str, list[list[str]]]
        Each matched activity name maps to a list whose single element is the
        subset of ``detected_objects`` belonging to that activity.  The nested
        list is kept so existing callers can keep indexing ``objects[0]``.
    """
    categorized_activities = {}

    for activity, keywords in activity_categories.items():
        # Bug fix: the original appended the FULL detection list to every
        # matched category, so unrelated objects leaked into each activity.
        # Keep only the objects that actually belong to this activity.
        matched = [obj for obj in detected_objects if obj in keywords]
        if matched:
            categorized_activities.setdefault(activity, []).append(matched)

    return categorized_activities


# Function to compare frames using SSIM to avoid repeated frames
def is_frame_different(frame1, frame2, threshold=0.9):
    """Return True when two frames differ enough to be worth processing.

    Both frames are converted to grayscale and compared with the structural
    similarity index (SSIM); a score below ``threshold`` counts as different.
    """
    gray_a = cv2.cvtColor(frame1, cv2.COLOR_BGR2GRAY)
    gray_b = cv2.cvtColor(frame2, cv2.COLOR_BGR2GRAY)
    similarity, _ = ssim(gray_a, gray_b, full=True)
    return similarity < threshold


# Function to process the video, detect objects, and generate a categorized journal with images
@spaces.GPU
def generate_journal_with_images(video_path, frame_interval=30,confidence_threshold=0.8):
    """Run YOLOv10 over a video (one frame per second) and build a journal.

    Parameters
    ----------
    video_path : str
        Path to a video file readable by OpenCV.
    frame_interval : int, optional
        NOTE(review): currently unused — sampling is driven by the
        one-frame-per-second timestamp check below, not by this value.
    confidence_threshold : float, optional
        Minimum detection confidence for an object to be kept.

    Returns
    -------
    tuple[list[str], list[str]]
        Journal entry strings and file paths of the saved annotated frames
        (written under ``detected_frames/``).
    """
    cap = cv2.VideoCapture(video_path)
    journal_entries = []
    image_paths = []
    frame_count = 0
    output_folder = "detected_frames"
    os.makedirs(output_folder, exist_ok=True)  # Create folder to store images

    last_processed_second = -1  # Keep track of the last processed second
    
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        
        # Get the current timestamp in the video
        current_time = cap.get(cv2.CAP_PROP_POS_MSEC) / 1000  # Convert ms to seconds
        current_second = int(current_time)  # Round down to the nearest second

        # Process only one frame per second
        if current_second > last_processed_second:
            # OpenCV decodes BGR; the model is given RGB.
            frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            
            # Make predictions using YOLOv10 on the current frame
            results = model.predict(source=frame_rgb, device=device)
            
            # Filter detected objects based on confidence threshold
            detected_objects = []
            for box in results[0].boxes:
                if box.conf >= confidence_threshold:  # Only include objects with confidence >= 0.8
                    detected_objects.append(model.names[int(box.cls)])
            
            # Only process frames where objects with confidence >= threshold are detected
            if detected_objects:  # If there are high-confidence detected objects
                
                # Plot bounding boxes and labels on the image
                annotated_frame = results[0].plot()  # Plot detection results on the frame
                
                # Save the annotated image
                frame_filename = os.path.join(output_folder, f"frame_{frame_count}.jpg")
                cv2.imwrite(frame_filename, annotated_frame[:, :, ::-1])  # Convert back to BGR for saving
                image_paths.append(frame_filename)
                
                # Categorize the detected objects into activities
                activity_summary = categorize_activity(detected_objects)
                
                # Store the activities with their timestamp
                # NOTE(review): objects[0] is the nested detection list that
                # categorize_activity stores for each matched activity.
                for activity, objects in activity_summary.items():
                    journal_entries.append(f"At {current_time:.2f} seconds: {', '.join(objects[0])}")
            
            last_processed_second = current_second  # Update the last processed second
        
        frame_count += 1
    
    cap.release()
    
    return journal_entries, image_paths  


def display_journal_with_images(video):
    """Gradio callback: build the journal text and image gallery for a video."""
    entries, images = generate_journal_with_images(video, frame_interval=30)
    return "\n".join(entries), images


# Gradio UI: upload a video, then generate the journal text and the gallery
# of annotated frames via display_journal_with_images.
with gr.Blocks() as iface:
    video_input = gr.Video(label="Upload Video", height=300)
    journal_output = gr.Textbox(label="Generated Daily Journal", lines=10)
    image_gallery = gr.Gallery(label="Annotated Frames")
    run_button = gr.Button("Generate Journal")

    run_button.click(fn=display_journal_with_images, inputs=video_input, outputs=[journal_output, image_gallery])

# Bug fix: iface.launch() was called twice; the duplicate call either blocked
# behind the first launch or tried to relaunch the app. Launch exactly once.
iface.launch()