IbrahimHasani committed
Commit 73957b0 · 1 Parent(s): 23ca17f

Create app.py

Files changed (1)
  1. app.py +145 -0
app.py ADDED
@@ -0,0 +1,145 @@
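+# Zero-shot action detection demo: sample 32 frames from an uploaded video,
+# score them with X-CLIP against the user-supplied activity label plus a
+# generic "other" label, and report whether the activity appears in the video.
+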
+import gradio as gr
+import torch
+import numpy as np
+from transformers import AutoProcessor, AutoModel
+from PIL import Image
+import cv2
+from concurrent.futures import ThreadPoolExecutor
+import os
+
+
+MODEL_NAME = "microsoft/xclip-base-patch16-zero-shot"
+CLIP_LEN = 32
+
+# Check if a GPU is available and set the device
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+print(device)
+
+# Load the model and processor once and move them to the device
+processor = AutoProcessor.from_pretrained(MODEL_NAME)
+model = AutoModel.from_pretrained(MODEL_NAME).to(device)
+
+def get_video_length(file_path):
+    cap = cv2.VideoCapture(file_path)
+    length = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+    cap.release()
+    return length
+
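+# Decode only the requested frame indices; each index is fetched by its own
+# worker (with its own VideoCapture handle), so frames can be read concurrently.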
+def read_video_opencv(file_path, indices):
+    frames = []
+    with ThreadPoolExecutor() as executor:
+        futures = [executor.submit(get_frame, file_path, i) for i in indices]
+        for future in futures:
+            frame = future.result()
+            if frame is not None:
+                frames.append(frame)
+    return frames
+
+def get_frame(file_path, index):
+    cap = cv2.VideoCapture(file_path)
+    cap.set(cv2.CAP_PROP_POS_FRAMES, index)
+    ret, frame = cap.read()
+    cap.release()
+    if ret:
+        return cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+    return None
+
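+# Choose CLIP_LEN evenly spaced frame indices; clips shorter than CLIP_LEN are
+# padded by repeating their frames. For example, a 320-frame video with
+# clip_len=32 gives a spacing of 10 and indices 0, 10, 20, ..., 310.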
+def sample_uniform_frame_indices(clip_len, seg_len):
+    if seg_len < clip_len:
+        repeat_factor = np.ceil(clip_len / seg_len).astype(int)
+        indices = np.arange(seg_len).tolist() * repeat_factor
+        indices = indices[:clip_len]
+    else:
+        spacing = seg_len // clip_len
+        indices = [i * spacing for i in range(clip_len)]
+    return np.array(indices).astype(np.int64)
+
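+# Tile the 32 sampled frames into a 4x8 contact sheet so the user can see
+# exactly which frames were scored (the layout dict only defines clip_len=32).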
+def concatenate_frames(frames, clip_len):
+    layout = {32: (4, 8)}
+    rows, cols = layout[clip_len]
+    combined_image = Image.new('RGB', (frames[0].shape[1] * cols, frames[0].shape[0] * rows))
+    frame_iter = iter(frames)
+    y_offset = 0
+    for i in range(rows):
+        x_offset = 0
+        for j in range(cols):
+            img = Image.fromarray(next(frame_iter))
+            combined_image.paste(img, (x_offset, y_offset))
+            x_offset += frames[0].shape[1]
+        y_offset += frames[0].shape[0]
+    return combined_image
+
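+# End-to-end pipeline for one request: sample frames, build the two-label prompt
+# [activity, "other"], run X-CLIP, and softmax the video-text logits so the two
+# labels' probabilities sum to 1.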
+def model_interface(uploaded_video, activity):
+    video_length = get_video_length(uploaded_video)
+    indices = sample_uniform_frame_indices(CLIP_LEN, seg_len=video_length)
+    video = read_video_opencv(uploaded_video, indices)
+    concatenated_image = concatenate_frames(video, CLIP_LEN)
+
+    activities_list = [activity, "other"]
+    inputs = processor(
+        text=activities_list,
+        videos=list(video),
+        return_tensors="pt",
+        padding=True,
+    )
+
+    # Move the input tensors to the same device as the model
+    for key, value in inputs.items():
+        if isinstance(value, torch.Tensor):
+            inputs[key] = value.to(device)
+
+    with torch.no_grad():
+        outputs = model(**inputs)
+
+    logits_per_video = outputs.logits_per_video
+    probs = logits_per_video.softmax(dim=1)
+
+    results_probs = []
+    results_logits = []
+    max_prob_index = torch.argmax(probs[0]).item()
+    for i in range(len(activities_list)):
+        current_activity = activities_list[i]
+        prob = float(probs[0][i].cpu())  # move tensor data to CPU for further processing
+        logit = float(logits_per_video[0][i].cpu())
+        results_probs.append((current_activity, f"Probability: {prob * 100:.2f}%"))
+        results_logits.append((current_activity, f"Raw Score: {logit:.2f}"))
+
+    likely_label = activities_list[max_prob_index]
+    likely_probability = float(probs[0][max_prob_index].cpu()) * 100
+
+    activity_performed = likely_label != "other"
+
+    return activity_performed, concatenated_image, results_probs, results_logits, [likely_label, likely_probability]
+
+
+# Load video paths from the samples folder (optional examples, currently disabled)
+#video_folder = "Action Detection Samples"
+#video_files = [os.path.join(video_folder, file) for file in os.listdir(video_folder) if file.endswith('.mp4')]  # consider only .mp4 files
+
+# Create examples: pair each sample video with the activity "taking a shot"
+#examples = [[video, "taking a shot"] for video in video_files]
+
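+# Gradio UI: a video upload and an activity prompt in; the True/False verdict,
+# the sampled-frame grid, per-label scores, and the top prediction out.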
+iface = gr.Interface(
+    fn=model_interface,
+    inputs=[
+        gr.components.Video(label="Upload a video file"),
+        gr.components.Text(value="taking a shot", label="Desired Activity to Recognize"),
+    ],
+    outputs=[
+        gr.components.Text(label="True/False"),
+        gr.components.Image(type="pil", label="Sampled Frames"),
+        gr.components.Text(label="Probabilities"),
+        gr.components.Text(label="Raw Scores"),
+        gr.components.Text(label="Top Prediction"),
+    ],
+    title="Engagify's Video Action Detection",
+    description=(
+        "[Author: Ibrahim Hasani] This method uses X-CLIP [zero-shot, 32 sampled frames] "
+        "as a binary classifier to determine whether an action is being performed in a video. "
+        "It contrasts the target action against a generic negative label ('other') that is "
+        "assumed to be far from it in the latent semantic space. Enter the action to be "
+        "performed in the activity field; do not enter negative labels."
+    ),
+    live=False,
+    theme=gr.themes.Monochrome(),
+    #examples=examples,  # add the sample videos to the interface
+)
+
+iface.launch()