Upload 2 files
app.py
ADDED
@@ -0,0 +1,122 @@
import cv2
import numpy as np
import torch
from torch import nn
from torchvision import transforms, models
import gradio as gr


class BISINDOClassifier(nn.Module):
    """VGG-19 frame features fed through a GRU for sign classification."""

    def __init__(self, hidden_dim, num_classes, num_layers=1):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_classes = num_classes
        self.num_layers = num_layers

        # Pre-trained VGG-19 backbone; the classifier head is dropped so the
        # network is used purely as a convolutional feature extractor.
        # (Newer torchvision prefers weights=models.VGG19_Weights.IMAGENET1K_V1.)
        self.vgg19 = models.vgg19(pretrained=True)
        self.vgg19.classifier = nn.Sequential()

        # GRU over the per-frame features (512 x 7 x 7, flattened to 25088).
        self.gru = nn.GRU(input_size=512 * 7 * 7, hidden_size=hidden_dim,
                          num_layers=num_layers, batch_first=True)

        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        # x: (batch, seq_len, channels, height, width)
        batch_size, seq_length, c, h, w = x.size()
        x = x.view(batch_size * seq_length, c, h, w)

        # The backbone is frozen at inference time.
        with torch.no_grad():
            x = self.vgg19.features(x)

        # Re-group the per-frame features into sequences for the GRU.
        x = x.view(batch_size, seq_length, -1)

        h0 = torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(x.device)
        x, _ = self.gru(x, h0)

        # Classify from the hidden state of the last time step.
        x = self.fc(x[:, -1, :])

        return x


# ImageNet preprocessing, matching the pre-trained VGG-19 backbone.
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = BISINDOClassifier(hidden_dim=512, num_classes=40, num_layers=1).to(device)
# map_location keeps the checkpoint loadable on CPU-only machines.
model.load_state_dict(torch.load('model.pth', map_location=device))

# The 40 BISINDO (Indonesian Sign Language) gloss labels.
classes = {'Bagaimana', 'Berapa', 'Baik', 'Selamat Malam', 'Halo', 'Pendek', 'Tidur', 'Selamat Sore', 'Membaca', 'Senang', 'Kemana', 'Dia', 'Apa Kabar', 'Saya', 'Apa', 'Kita', 'Sabar', 'Selamat Siang', 'Kalian', 'Dimana', 'Duduk', 'Kapan', 'Mereka', 'Kamu', 'Ramah', 'Makan', 'Tinggi', 'Marah', 'Berdiri', 'Melihat', 'Minum', 'Siapa', 'Selamat Pagi', 'Kami', 'Mandi', 'Menulis', 'Terima Kasih', 'Sedih', 'Bingung', 'Belajar'}
# Sort alphabetically so label indices are deterministic; sorted() already
# returns a list.
classes = sorted(classes)


def extract_frames(video_path, sequence_length, interval=10, img_size=(224, 224)):
    """Sample every `interval`-th decoded frame until `sequence_length`
    frames are collected (with the defaults, drawn from the first ~100
    frames of the clip), padding with the last frame if the video is
    too short."""
    cap = cv2.VideoCapture(video_path)

    if not cap.isOpened():
        print(f"Error opening video file {video_path}")
        return []

    frames = []
    frame_count = 0

    while len(frames) < sequence_length:
        ret, frame = cap.read()
        if not ret:
            break
        if frame_count % interval == 0:
            # Note: OpenCV decodes to BGR and the frames are passed on in
            # that channel order; this is consistent only if the checkpoint
            # was trained with the same pipeline.
            frame = cv2.resize(frame, img_size)
            frames.append(frame)
        frame_count += 1

    cap.release()

    # Pad short videos by repeating the last frame, or with black frames if
    # nothing was decoded at all.
    while len(frames) < sequence_length:
        if frames:
            frames.append(frames[-1])
        else:
            frames.append(np.zeros((img_size[1], img_size[0], 3), dtype=np.uint8))

    return frames[:sequence_length]


def preprocess_frames(frames):
    # Stack the transformed frames into a (1, seq_len, C, H, W) batch.
    frames = [transform(frame) for frame in frames]
    frames = torch.stack(frames)
    return frames.unsqueeze(0)


def predict(video_path, sequence_length):
    model.eval()

    frames = extract_frames(video_path, sequence_length)
    if not frames:
        # extract_frames returns an empty list when the video cannot be opened.
        return {"label": None, "confidence": 0.0}

    input_tensor = preprocess_frames(frames).to(device)

    with torch.no_grad():
        output = model(input_tensor)
        probabilities = nn.functional.softmax(output, dim=1)
        confidence, predicted = torch.max(probabilities, 1)

    confidence_score = confidence.item()
    predicted_label = classes[predicted.item()]

    return {"label": predicted_label, "confidence": confidence_score}


with gr.Blocks() as demo:
    with gr.Row():
        input_video = gr.Video(label="Input")
        output_json = gr.JSON(label="Output")
    process_video_btn = gr.Button("Process Video")

    def process_video(video):
        result = predict(video, sequence_length=10)
        result["confidence"] = f"{result['confidence']:.2f}"
        return result

    process_video_btn.click(process_video, input_video, output_json)

demo.launch()
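For a quick sanity check of the tensor shapes the forward pass expects, here is a minimal sketch (assuming the BISINDOClassifier class from app.py above is in scope; the GRU and linear layers are randomly initialized, so only the output shape is meaningful):

import torch

# One dummy clip: (batch=1, seq_len=10, channels=3, height=224, width=224),
# the same shape that extract_frames + preprocess_frames produce.
m = BISINDOClassifier(hidden_dim=512, num_classes=40, num_layers=1)
m.eval()
dummy = torch.randn(1, 10, 3, 224, 224)
with torch.no_grad():
    logits = m(dummy)
print(logits.shape)  # torch.Size([1, 40]) -- one logit per gloss

Note that instantiating the class downloads the ImageNet VGG-19 weights on first run.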
model.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:390b4f8d9cc513cb6cb3e97ca9724ec0dafba0753439dd99e0c6673cb51dfae5
size 237489311
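model.pth is tracked with Git LFS, so a clone without LFS support checks out only the three-line pointer above instead of the ~237 MB checkpoint (`git lfs pull` inside a clone fetches the real file). One way to fetch the resolved file programmatically is huggingface_hub; the repo id below is a placeholder, not something given in this diff:

from huggingface_hub import hf_hub_download

# "user/space-name" is a placeholder Space id; substitute the real one.
path = hf_hub_download(repo_id="user/space-name",
                       filename="model.pth",
                       repo_type="space")
print(path)  # local cache path to the checkpoint, not the LFS pointer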