Spaces:
Sleeping
Sleeping
File size: 4,010 Bytes
adee3ce |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 |
import cv2
import numpy as np
import torch
from torch import nn
from torchvision import transforms, models
import gradio as gr
class BISINDOClassifier(nn.Module):
    """Video classifier: per-frame VGG-19 features fed through a GRU.

    Each frame of a clip is encoded by the convolutional part of a
    pre-trained VGG-19, the flattened per-frame features are run through a
    GRU, and the GRU output at the last time step is mapped to class logits
    by a linear layer.
    """

    def __init__(self, hidden_dim, num_classes, num_layers=1):
        """Build the VGG-19 backbone, the GRU and the output layer.

        Args:
            hidden_dim: Size of the GRU hidden state.
            num_classes: Number of output logits.
            num_layers: Number of stacked GRU layers.
        """
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_classes = num_classes
        self.num_layers = num_layers

        # Pre-trained VGG-19 backbone; its classifier head is replaced by an
        # empty Sequential because forward() only uses `features`.
        backbone = models.vgg19(pretrained=True)
        backbone.classifier = nn.Sequential()
        self.vgg19 = backbone

        # The GRU input size is fixed at 512*7*7, i.e. the flattened VGG-19
        # feature map of a 224x224 frame.
        self.gru = nn.GRU(
            input_size=512 * 7 * 7,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        """Return class logits for a batch of frame sequences.

        Args:
            x: Tensor of shape (batch, seq_length, channels, height, width).

        Returns:
            Tensor of shape (batch, num_classes) computed from the GRU output
            at the final time step.
        """
        n_clips, n_frames, channels, height, width = x.shape

        # Fold time into the batch dimension so the CNN sees plain images.
        flat_frames = x.view(n_clips * n_frames, channels, height, width)

        # The backbone runs without gradient tracking, so it is never updated.
        with torch.no_grad():
            features = self.vgg19.features(flat_frames)

        # Restore the time axis and flatten each frame's feature map.
        sequence = features.view(n_clips, n_frames, -1)

        initial_state = torch.zeros(
            self.num_layers, n_clips, self.hidden_dim, device=sequence.device
        )
        gru_out, _ = self.gru(sequence, initial_state)

        last_step = gru_out[:, -1, :]
        return self.fc(last_step)
# Per-frame preprocessing: ndarray -> PIL image -> 224x224 -> float tensor ->
# channel-wise normalisation with the mean/std below.
_CHANNEL_MEAN = [0.485, 0.456, 0.406]
_CHANNEL_STD = [0.229, 0.224, 0.225]
_FRAME_SIZE = (224, 224)

_preprocessing_steps = [
    transforms.ToPILImage(),
    transforms.Resize(_FRAME_SIZE),
    transforms.ToTensor(),
    transforms.Normalize(_CHANNEL_MEAN, _CHANNEL_STD),
]
transform = transforms.Compose(_preprocessing_steps)
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 40-class classifier with a single-layer GRU of hidden size 512.
model = BISINDOClassifier(hidden_dim=512, num_classes=40, num_layers=1).to(device)

# map_location remaps the stored tensors onto `device`, so a checkpoint that
# was saved from a CUDA model still loads on a CPU-only host.
model.load_state_dict(torch.load('model.pth', map_location=device))
# Class labels in sorted order; the predicted class index is used to index
# into this list.
classes = sorted({
    'Bagaimana', 'Berapa', 'Baik', 'Selamat Malam', 'Halo',
    'Pendek', 'Tidur', 'Selamat Sore', 'Membaca', 'Senang',
    'Kemana', 'Dia', 'Apa Kabar', 'Saya', 'Apa',
    'Kita', 'Sabar', 'Selamat Siang', 'Kalian', 'Dimana',
    'Duduk', 'Kapan', 'Mereka', 'Kamu', 'Ramah',
    'Makan', 'Tinggi', 'Marah', 'Berdiri', 'Melihat',
    'Minum', 'Siapa', 'Selamat Pagi', 'Kami', 'Mandi',
    'Menulis', 'Terima Kasih', 'Sedih', 'Bingung', 'Belajar',
})
def extract_frames(video_path, sequence_length, interval=10, img_size=(224, 224)):
    """Sample a fixed number of resized frames from a video file.

    Every ``interval``-th frame (starting with the first) is resized to
    ``img_size`` and collected until ``sequence_length`` frames are gathered
    or the video ends. A short result is padded to ``sequence_length`` by
    repeating the last collected frame, or with black frames if nothing
    could be read.

    Args:
        video_path: Path of the video to open with OpenCV.
        sequence_length: Number of frames to return.
        interval: Keep one frame out of every ``interval`` frames read.
        img_size: Target ``(width, height)`` passed to ``cv2.resize``.

    Returns:
        A list of ``sequence_length`` frames, or an empty list if the video
        could not be opened.
    """
    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        if not cap.isOpened():
            print(f"Error opening video file {video_path}")
            return []

        frame_count = 0
        while len(frames) < sequence_length:
            ret, frame = cap.read()
            if not ret:
                break
            if frame_count % interval == 0:
                # NOTE(review): OpenCV decodes frames in BGR channel order;
                # confirm this matches how the model was trained.
                frames.append(cv2.resize(frame, img_size))
            frame_count += 1
    finally:
        # Always give the capture handle back, even if decoding or resizing
        # raised part-way through.
        cap.release()

    missing = sequence_length - len(frames)
    if missing > 0:
        if frames:
            filler = frames[-1]
        else:
            # img_size is (width, height); the array layout is (rows, cols, 3).
            filler = np.zeros((img_size[1], img_size[0], 3), dtype=np.uint8)
        frames.extend([filler] * missing)
    return frames
def preprocess_frames(frames):
    """Turn a list of frames into a single batched tensor.

    Each frame is passed through the module-level ``transform``, the results
    are stacked along a new leading dimension, and a batch dimension of
    size 1 is added in front.
    """
    per_frame_tensors = list(map(transform, frames))
    clip = torch.stack(per_frame_tensors)
    return torch.unsqueeze(clip, 0)
def predict(video_path, sequence_length):
    """Classify the video at ``video_path``.

    Args:
        video_path: Path of the video file to classify.
        sequence_length: Number of frames to sample from the video.

    Returns:
        A dict with the predicted class name under ``"label"`` and its
        softmax probability (a float) under ``"confidence"``.

    Raises:
        ValueError: If no frames could be extracted from the video.
    """
    frames = extract_frames(video_path, sequence_length)
    if not frames:
        # Fail with a clear message instead of letting torch.stack([]) raise
        # an opaque error further down.
        raise ValueError(f"No frames could be extracted from video: {video_path!r}")

    model.eval()
    input_tensor = preprocess_frames(frames).to(device)

    with torch.no_grad():
        output = model(input_tensor)
        probabilities = nn.functional.softmax(output, dim=1)

    confidence, predicted = torch.max(probabilities, 1)
    return {
        "label": classes[predicted.item()],
        "confidence": confidence.item(),
    }
# Gradio UI: a video input, a JSON output and a button that runs the classifier.
with gr.Blocks() as demo:
    with gr.Row():
        input_video = gr.Video(label="Input")
        output_json = gr.JSON(label="Output")
        process_video_btn = gr.Button("Process Video")

    def process_video(video):
        """Classify the uploaded video and format the confidence to 2 decimals."""
        if video is None:
            # The button was clicked before a video was provided.
            raise gr.Error("Please upload a video first.")
        result = predict(video, sequence_length=10)
        result["confidence"] = f"{result['confidence']:.2f}"
        return result

    process_video_btn.click(process_video, input_video, output_json)

demo.launch()