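"""Gradio demo for BISINDO (Indonesian Sign Language) word recognition.

Each uploaded video is sampled into a fixed-length frame sequence; a
pre-trained VGG-19 extracts per-frame features and a GRU classifies the
sequence as one of 40 BISINDO glosses. A trained checkpoint is expected at
'model.pth' alongside this script.
"""
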
import cv2
import numpy as np
import torch
from torch import nn
from torchvision import transforms, models
import gradio as gr

class BISINDOClassifier(nn.Module):
    """VGG-19 per-frame feature extractor followed by a GRU sequence classifier."""

    def __init__(self, hidden_dim, num_classes, num_layers=1):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_classes = num_classes
        self.num_layers = num_layers

        # Pre-trained VGG-19; the classifier head is replaced with an empty
        # Sequential since only the convolutional features are used.
        self.vgg19 = models.vgg19(weights=models.VGG19_Weights.IMAGENET1K_V1)
        self.vgg19.classifier = nn.Sequential()

        # GRU over flattened per-frame features (512*7*7 = 25088 for 224x224 input).
        self.gru = nn.GRU(input_size=512 * 7 * 7, hidden_size=hidden_dim,
                          num_layers=num_layers, batch_first=True)

        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        # x: (batch, seq_len, C, H, W) -> fold frames into the batch dimension.
        batch_size, seq_length, c, h, w = x.size()
        x = x.view(batch_size * seq_length, c, h, w)

        # The backbone is used as a frozen feature extractor.
        with torch.no_grad():
            x = self.vgg19.features(x)

        # Restore the sequence dimension: (batch, seq_len, 512*7*7).
        x = x.view(batch_size, seq_length, -1)

        h0 = torch.zeros(self.num_layers, batch_size, self.hidden_dim, device=x.device)
        x, _ = self.gru(x, h0)

        # Classify from the GRU output at the final time step.
        x = self.fc(x[:, -1, :])

        return x
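
# Quick shape reference (illustrative, not executed by the app): a batch of
# 2 clips of 10 RGB frames at 224x224 yields one logit row per clip:
#   m = BISINDOClassifier(hidden_dim=512, num_classes=40)
#   m(torch.randn(2, 10, 3, 224, 224)).shape  # torch.Size([2, 40])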

# Per-frame preprocessing: resize to VGG-19's expected 224x224 input and
# normalize with the ImageNet mean/std the backbone was trained with.
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = BISINDOClassifier(hidden_dim=512, num_classes=40, num_layers=1).to(device)
# map_location lets the checkpoint load on CPU-only machines as well.
model.load_state_dict(torch.load('model.pth', map_location=device))
model.eval()

# The 40 BISINDO glosses, sorted so that class indices are deterministic.
classes = sorted([
    'Bagaimana', 'Berapa', 'Baik', 'Selamat Malam', 'Halo', 'Pendek', 'Tidur',
    'Selamat Sore', 'Membaca', 'Senang', 'Kemana', 'Dia', 'Apa Kabar', 'Saya',
    'Apa', 'Kita', 'Sabar', 'Selamat Siang', 'Kalian', 'Dimana', 'Duduk',
    'Kapan', 'Mereka', 'Kamu', 'Ramah', 'Makan', 'Tinggi', 'Marah', 'Berdiri',
    'Melihat', 'Minum', 'Siapa', 'Selamat Pagi', 'Kami', 'Mandi', 'Menulis',
    'Terima Kasih', 'Sedih', 'Bingung', 'Belajar',
])

def extract_frames(video_path, sequence_length, interval=10, img_size=(224, 224)):
    """Sample every `interval`-th frame until `sequence_length` frames are collected."""
    cap = cv2.VideoCapture(video_path)

    if not cap.isOpened():
        print(f"Error opening video file {video_path}")
        return []

    frames = []
    frame_count = 0

    while len(frames) < sequence_length:
        ret, frame = cap.read()
        if not ret:
            break
        if frame_count % interval == 0:
            # OpenCV decodes frames as BGR; convert to RGB so the ImageNet
            # normalization applies to the channels it expects (this assumes
            # the model was trained on RGB frames).
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame = cv2.resize(frame, img_size)
            frames.append(frame)
        frame_count += 1

    cap.release()

    # Pad short clips by repeating the last frame, or with black frames if
    # nothing could be decoded at all.
    while len(frames) < sequence_length:
        frames.append(frames[-1] if frames
                      else np.zeros((img_size[1], img_size[0], 3), dtype=np.uint8))

    return frames
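
# Example (hypothetical clip): extract_frames("clip.mp4", sequence_length=10)
# returns 10 RGB uint8 frames of 224x224, sampling every 10th decoded frame.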

def preprocess_frames(frames):
    # Apply the per-frame transform, then stack into (1, seq_len, C, H, W).
    frames = [transform(frame) for frame in frames]
    frames = torch.stack(frames)
    return frames.unsqueeze(0)

def predict(video_path, sequence_length):
    model.eval()

    frames = extract_frames(video_path, sequence_length)
    if not frames:
        # The video could not be opened; avoid torch.stack on an empty list.
        return {"label": None, "confidence": 0.0}

    input_tensor = preprocess_frames(frames).to(device)

    with torch.no_grad():
        output = model(input_tensor)
        probabilities = nn.functional.softmax(output, dim=1)
        confidence, predicted = torch.max(probabilities, 1)

    return {"label": classes[predicted.item()], "confidence": confidence.item()}
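
# Example (hypothetical file; output values illustrative):
#   predict("sample.mp4", sequence_length=10)
#   # {'label': 'Halo', 'confidence': 0.93}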

with gr.Blocks() as demo:
    with gr.Row():
        input_video = gr.Video(label="Input")
        output_json = gr.JSON(label="Output")
        process_video_btn = gr.Button("Process Video")

    def process_video(video):
        # gr.Video passes the uploaded file's path, or None if nothing was uploaded.
        if video is None:
            return {"label": None, "confidence": "0.00"}
        result = predict(video, sequence_length=10)
        result["confidence"] = f"{result['confidence']:.2f}"
        return result

    process_video_btn.click(process_video, inputs=input_video, outputs=output_json)

demo.launch()