Upload 19 files
- .gitattributes +7 -0
- .prettierrc +11 -0
- app/__init__.py +62 -0
- app/camera/__init__.py +58 -0
- app/checkpoints/xaa +3 -0
- app/checkpoints/xab +3 -0
- app/checkpoints/xac +3 -0
- app/checkpoints/xad +3 -0
- app/checkpoints/xae +3 -0
- app/checkpoints/xaf +3 -0
- app/config/__init__.py +9 -0
- app/data/info.pkl +3 -0
- app/models/__init__.py +444 -0
- app/server/__init__.py +50 -0
- app/translator/__init__.py +245 -0
- main.py +4 -0
- public/index.css +20 -0
- public/index.html +67 -0
- requirements.txt +14 -9
- resources/demo.gif +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+app/checkpoints/xaa filter=lfs diff=lfs merge=lfs -text
+app/checkpoints/xab filter=lfs diff=lfs merge=lfs -text
+app/checkpoints/xac filter=lfs diff=lfs merge=lfs -text
+app/checkpoints/xad filter=lfs diff=lfs merge=lfs -text
+app/checkpoints/xae filter=lfs diff=lfs merge=lfs -text
+app/checkpoints/xaf filter=lfs diff=lfs merge=lfs -text
+resources/demo.gif filter=lfs diff=lfs merge=lfs -text
.prettierrc
ADDED
@@ -0,0 +1,11 @@
{
  "printWidth": 120,
  "tabWidth": 2,
  "useTabs": false,
  "semi": false,
  "singleQuote": true,
  "trailingComma": "none",
  "bracketSpacing": true,
  "bracketSameLine": true,
  "arrowParens": "always"
}
app/__init__.py
ADDED
@@ -0,0 +1,62 @@
from sys import argv
from time import sleep

import cv2 as cv

from app.camera import Camera
from app.server import HTTPDaemon
from app.translator import Translator


def camera_loop(camera: Camera, translator: Translator):

    retry_count = 0

    while not camera.is_capturing():
        if retry_count > 3:
            raise Exception("No camera frames found.")

        retry_count += 1
        sleep(1)

    while True:
        cv.imshow('Input', camera.buffer[-1])
        translator.video_to_asl(camera.buffer)

        if cv.waitKey(1) == 27:  # Esc
            break

    cv.destroyAllWindows()


def main(translator: Translator):

    with Camera(0, 64) as camera:
        camera_loop(camera, translator)


def init_server():

    host = 'localhost'
    port = parse_args()
    translator = Translator(confidence=0.7)

    with HTTPDaemon(host, port, translator):
        try:
            main(translator)

        except KeyboardInterrupt:
            print("\nManual exit detected.")

        finally:
            print("Exiting..")


def parse_args() -> int:

    try:
        return 5000 if len(argv) < 2 else int(argv[1])

    except ValueError:
        print("\nPort must be an integer.\ne.g. python main.py 5000\n")
        raise
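The entry point above wires everything together: init_server reads an optional port from argv, starts the HTTP daemon on a background thread, and runs the OpenCV capture/translate loop in the foreground until Esc or Ctrl-C. A launch sketch (hypothetical session; assumes the requirements, a CUDA device for the model, and a webcam are available):

    # Equivalent to `python main.py 8000`; parse_args() falls back to port 5000.
    import sys

    sys.argv = ["main.py", "8000"]  # parse_args() reads the port from argv[1]

    from app import init_server

    init_server()  # serves http://localhost:8000/ and opens the capture window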
app/camera/__init__.py
ADDED
@@ -0,0 +1,58 @@
from collections import deque
from threading import Thread

from cv2 import VideoCapture
from typing_extensions import Self


class Camera:

    def __init__(self, capture_id: int=0, buffer_size: int=1):

        self.capture_id = capture_id
        self.buffer_size = buffer_size
        self.capture: VideoCapture
        self.capture_thread: Thread
        self.buffer = deque([], buffer_size)
        self.stop_capture = False


    def __enter__(self) -> Self:

        self.capture = VideoCapture(self.capture_id)

        if not self.capture.isOpened():
            raise IOError("Unable to open device.")

        self.capture_thread = Thread(target=self.start_capture)
        self.capture_thread.start()

        return self


    def __exit__(self, *_):

        self.stop_capture = True
        self.capture_thread.join()
        self.capture.release()


    def start_capture(self):

        while True:
            if self.stop_capture:
                break

            _, frame = self.capture.read()

            if len(self.buffer) == self.buffer_size:
                self.buffer.popleft()  # redundant with the deque's maxlen, but explicit

            self.buffer.append(frame)


    def is_capturing(self) -> bool:

        print(f"Filling buffer: {len(self.buffer)}/{self.buffer_size}")
        return len(self.buffer) == self.buffer_size
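A minimal usage sketch for Camera (hypothetical; assumes a webcam at device index 0 and the requirements installed):

    from time import sleep

    from app.camera import Camera

    with Camera(0, buffer_size=16) as cam:
        while not cam.is_capturing():  # prints fill progress until the deque is full
            sleep(0.5)
        latest_frame = cam.buffer[-1]  # newest BGR frame, a numpy array from cv2
        print(latest_frame.shape)      # e.g. (480, 640, 3)

The capture thread runs for the lifetime of the with block; __exit__ sets the stop flag, joins the thread, and releases the device.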
app/checkpoints/xaa
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:07c12b25f4ba3ce4e354282940ffa010b063206eb7c676a9a404c125a657de4d
size 20971520
app/checkpoints/xab
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bcdb8eba8a143d5e0cf7509f66a0a560262d8b3117015e6d9b9449e2cd865ff4
size 20971520
app/checkpoints/xac
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:57d719a7c1c3ce68484a3fa28557aba9b91e60ed6919a7b34ba217f37f4ac477
size 20971520
app/checkpoints/xad
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d63027be73a90be23789000844df7344dd4cd353195ee61a8a28cfd6b7af6565
size 20971520
app/checkpoints/xae
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5bf557f3e89ab18a875d7be9f94e0955f8c70117792260dcda6590e3c0462924
size 20971520
app/checkpoints/xaf
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:935619ea8ee2a43891643a6111479d35328a889b238fa13b760acd538d8c4f71
size 9996102
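The six xa* blobs look like the output of `split` on a single checkpoint file (five exact 20 MiB chunks plus a smaller tail), presumably to stay under a per-file upload limit. Translator.load_model reassembles them with a shell `cat app/checkpoints/* >> app/checkpoints/model.pth.tar` on first run; a pure-Python equivalent as a sketch (the assemble_checkpoint helper is hypothetical, not part of this commit):

    from glob import glob
    from os.path import exists


    def assemble_checkpoint(out_path: str = "app/checkpoints/model.pth.tar") -> None:
        if exists(out_path):
            return
        with open(out_path, "wb") as out:
            # sorted() restores split's chunk order: xaa, xab, ..., xaf
            for chunk_path in sorted(glob("app/checkpoints/xa?")):
                with open(chunk_path, "rb") as chunk:
                    out.write(chunk.read())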
app/config/__init__.py
ADDED
@@ -0,0 +1,9 @@
class Config:

    checkpoint_path = 'app/checkpoints/model.pth.tar'
    vocabulary_path = "app/data/info.pkl"
    number_of_classes = 2000
    number_of_frames = 64
    stride = 8
    batch_size = 10
    topk = 1
app/data/info.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3a38d2b934b7bb3923b4ffad96789414a346cc4f38af26943077dc1fb66dc068
size 2182947
app/models/__init__.py
ADDED
@@ -0,0 +1,444 @@
from math import ceil

from torch import cat
from torch.nn import AvgPool3d, BatchNorm3d, Conv3d, Dropout, MaxPool3d, Module
from torch.nn.functional import pad, relu


class MaxPool3dSamePadding(MaxPool3d):
    def compute_pad(self, dim, s):
        if s % self.stride[dim] == 0:
            return max(self.kernel_size[dim] - self.stride[dim], 0)
        else:
            return max(self.kernel_size[dim] - (s % self.stride[dim]), 0)

    def forward(self, x):
        # compute 'same' padding
        _, _, t, h, w = x.size()
        # print t,h,w
        # out_t = np.ceil(float(t) / float(self.stride[0]))
        # out_h = np.ceil(float(h) / float(self.stride[1]))
        # out_w = np.ceil(float(w) / float(self.stride[2]))
        # print out_t, out_h, out_w
        pad_t = self.compute_pad(0, t)
        pad_h = self.compute_pad(1, h)
        pad_w = self.compute_pad(2, w)
        # print pad_t, pad_h, pad_w

        pad_t_f = pad_t // 2
        pad_t_b = pad_t - pad_t_f
        pad_h_f = pad_h // 2
        pad_h_b = pad_h - pad_h_f
        pad_w_f = pad_w // 2
        pad_w_b = pad_w - pad_w_f

        padding = (pad_w_f, pad_w_b, pad_h_f, pad_h_b, pad_t_f, pad_t_b)
        x = pad(x, padding)
        return super(MaxPool3dSamePadding, self).forward(x)


class Unit3D(Module):

    def __init__(
        self,
        in_channels,
        output_channels,
        kernel_shape=(1, 1, 1),
        stride=(1, 1, 1),
        padding=0,
        activation_fn=relu,
        use_batch_norm=True,
        use_bias=False,
        name="unit_3d",
        num_domains=1,
    ):
        """Initializes Unit3D module."""
        super(Unit3D, self).__init__()

        self._output_channels = output_channels
        self._kernel_shape = kernel_shape
        self._stride = stride
        self._use_batch_norm = use_batch_norm
        self._num_domains = num_domains
        self._activation_fn = activation_fn
        self._use_bias = use_bias
        self.name = name
        self.padding = padding

        self.conv3d = Conv3d(
            in_channels=in_channels,
            out_channels=self._output_channels,
            kernel_size=self._kernel_shape,
            stride=self._stride,
            padding=0,  # we always want padding to be 0 here. We will dynamically pad based on input size in forward function
            bias=self._use_bias,
        )

        if self._use_batch_norm:
            if self._num_domains == 1:
                self.bn = BatchNorm3d(self._output_channels, eps=0.001, momentum=0.01)

    def compute_pad(self, dim, s):
        if s % self._stride[dim] == 0:
            return max(self._kernel_shape[dim] - self._stride[dim], 0)
        else:
            return max(self._kernel_shape[dim] - (s % self._stride[dim]), 0)

    def forward(self, x):
        # compute 'same' padding
        _, _, t, h, w = x.size()
        # print t,h,w
        # out_t = np.ceil(float(t) / float(self._stride[0]))
        # out_h = np.ceil(float(h) / float(self._stride[1]))
        # out_w = np.ceil(float(w) / float(self._stride[2]))
        # print out_t, out_h, out_w
        pad_t = self.compute_pad(0, t)
        pad_h = self.compute_pad(1, h)
        pad_w = self.compute_pad(2, w)
        # print pad_t, pad_h, pad_w

        pad_t_f = pad_t // 2
        pad_t_b = pad_t - pad_t_f
        pad_h_f = pad_h // 2
        pad_h_b = pad_h - pad_h_f
        pad_w_f = pad_w // 2
        pad_w_b = pad_w - pad_w_f

        padding = (pad_w_f, pad_w_b, pad_h_f, pad_h_b, pad_t_f, pad_t_b)
        # print x.size()
        # print pad
        x = pad(x, padding)
        # print x.size()

        x = self.conv3d(x)
        if self._use_batch_norm:
            x = self.bn(x)
        if self._activation_fn is not None:
            x = self._activation_fn(x)
        return x


class InceptionModule(Module):
    def __init__(self, in_channels, out_channels, name, num_domains=1):
        super(InceptionModule, self).__init__()

        self.b0 = Unit3D(
            in_channels=in_channels,
            output_channels=out_channels[0],
            kernel_shape=[1, 1, 1],
            padding=0,
            name=name + "/Branch_0/Conv3d_0a_1x1",
        )
        self.b1a = Unit3D(
            in_channels=in_channels,
            output_channels=out_channels[1],
            kernel_shape=[1, 1, 1],
            padding=0,
            name=name + "/Branch_1/Conv3d_0a_1x1",
        )
        self.b1b = Unit3D(
            in_channels=out_channels[1],
            output_channels=out_channels[2],
            kernel_shape=[3, 3, 3],
            name=name + "/Branch_1/Conv3d_0b_3x3",
        )
        self.b2a = Unit3D(
            in_channels=in_channels,
            output_channels=out_channels[3],
            kernel_shape=[1, 1, 1],
            padding=0,
            name=name + "/Branch_2/Conv3d_0a_1x1",
        )
        self.b2b = Unit3D(
            in_channels=out_channels[3],
            output_channels=out_channels[4],
            kernel_shape=[3, 3, 3],
            name=name + "/Branch_2/Conv3d_0b_3x3",
        )
        self.b3a = MaxPool3dSamePadding(
            kernel_size=[3, 3, 3], stride=(1, 1, 1), padding=0
        )
        self.b3b = Unit3D(
            in_channels=in_channels,
            output_channels=out_channels[5],
            kernel_shape=[1, 1, 1],
            padding=0,
            name=name + "/Branch_3/Conv3d_0b_1x1",
        )
        self.name = name

    def forward(self, x):
        b0 = self.b0(x)
        b1 = self.b1b(self.b1a(x))
        b2 = self.b2b(self.b2a(x))
        b3 = self.b3b(self.b3a(x))
        return cat([b0, b1, b2, b3], dim=1)


class InceptionI3d(Module):
    """Inception-v1 I3D architecture.
    The model is introduced in:
        Quo Vadis, Action Recognition? A New Model and the Kinetics Dataset
        Joao Carreira, Andrew Zisserman
        https://arxiv.org/pdf/1705.07750v1.pdf.
    See also the Inception architecture, introduced in:
        Going deeper with convolutions
        Christian Szegedy, Wei Liu, Yangqing Jia, Pierre Sermanet, Scott Reed,
        Dragomir Anguelov, Dumitru Erhan, Vincent Vanhoucke, Andrew Rabinovich.
        http://arxiv.org/pdf/1409.4842v1.pdf.
    """

    # Endpoints of the model in order. During construction, all the endpoints up
    # to a designated `final_endpoint` are returned in a dictionary as the
    # second return value.
    VALID_ENDPOINTS = (
        "Conv3d_1a_7x7",
        "MaxPool3d_2a_3x3",
        "Conv3d_2b_1x1",
        "Conv3d_2c_3x3",
        "MaxPool3d_3a_3x3",
        "Mixed_3b",
        "Mixed_3c",
        "MaxPool3d_4a_3x3",
        "Mixed_4b",
        "Mixed_4c",
        "Mixed_4d",
        "Mixed_4e",
        "Mixed_4f",
        "MaxPool3d_5a_2x2",
        "Mixed_5b",
        "Mixed_5c",
        "Logits",
        "Predictions",
    )

    def __init__(
        self,
        num_classes=400,
        spatiotemporal_squeeze=True,
        final_endpoint="Logits",
        name="inception_i3d",
        in_channels=3,
        dropout_keep_prob=0.5,
        num_in_frames=64,
        include_embds=False,
    ):
        """Initializes I3D model instance.
        Args:
            num_classes: The number of outputs in the logit layer (default 400, which
                matches the Kinetics dataset).
            spatiotemporal_squeeze: Whether to squeeze the 2 spatial and 1 temporal dimensions for the logits
                before returning (default True).
            final_endpoint: The model contains many possible endpoints.
                `final_endpoint` specifies the last endpoint for the model to be built
                up to. In addition to the output at `final_endpoint`, all the outputs
                at endpoints up to `final_endpoint` will also be returned, in a
                dictionary. `final_endpoint` must be one of
                InceptionI3d.VALID_ENDPOINTS (default 'Logits').
            in_channels: Number of input channels (default 3 for RGB).
            dropout_keep_prob: Dropout probability (default 0.5).
            name: A string (optional). The name of this module.
            num_in_frames: Number of input frames (default 64).
            include_embds: Whether to return embeddings (default False).
        Raises:
            ValueError: if `final_endpoint` is not recognized.
        """

        if final_endpoint not in self.VALID_ENDPOINTS:
            raise ValueError("Unknown final endpoint %s" % final_endpoint)

        super().__init__()
        self._num_classes = num_classes
        self._spatiotemporal_squeeze = spatiotemporal_squeeze
        self._final_endpoint = final_endpoint
        self.include_embds = include_embds
        self.logits = None

        if self._final_endpoint not in self.VALID_ENDPOINTS:
            raise ValueError("Unknown final endpoint %s" % self._final_endpoint)

        self.end_points = {}
        end_point = "Conv3d_1a_7x7"
        self.end_points[end_point] = Unit3D(
            in_channels=in_channels,
            output_channels=64,
            kernel_shape=[7, 7, 7],
            stride=(2, 2, 2),
            padding=(3, 3, 3),
            name=name + end_point,
        )
        if self._final_endpoint == end_point:
            return

        end_point = "MaxPool3d_2a_3x3"
        self.end_points[end_point] = MaxPool3dSamePadding(
            kernel_size=[1, 3, 3], stride=(1, 2, 2), padding=0
        )
        if self._final_endpoint == end_point:
            return

        end_point = "Conv3d_2b_1x1"
        self.end_points[end_point] = Unit3D(
            in_channels=64,
            output_channels=64,
            kernel_shape=[1, 1, 1],
            padding=0,
            name=name + end_point,
        )
        if self._final_endpoint == end_point:
            return

        end_point = "Conv3d_2c_3x3"
        self.end_points[end_point] = Unit3D(
            in_channels=64,
            output_channels=192,
            kernel_shape=[3, 3, 3],
            padding=1,
            name=name + end_point,
        )
        if self._final_endpoint == end_point:
            return

        end_point = "MaxPool3d_3a_3x3"
        self.end_points[end_point] = MaxPool3dSamePadding(
            kernel_size=[1, 3, 3], stride=(1, 2, 2), padding=0
        )
        if self._final_endpoint == end_point:
            return

        end_point = "Mixed_3b"
        self.end_points[end_point] = InceptionModule(
            192, [64, 96, 128, 16, 32, 32], name + end_point,
        )
        if self._final_endpoint == end_point:
            return

        end_point = "Mixed_3c"
        self.end_points[end_point] = InceptionModule(
            256, [128, 128, 192, 32, 96, 64], name + end_point,
        )
        if self._final_endpoint == end_point:
            return

        end_point = "MaxPool3d_4a_3x3"
        self.end_points[end_point] = MaxPool3dSamePadding(
            kernel_size=[3, 3, 3], stride=(2, 2, 2), padding=0
        )
        if self._final_endpoint == end_point:
            return

        end_point = "Mixed_4b"
        self.end_points[end_point] = InceptionModule(
            128 + 192 + 96 + 64, [192, 96, 208, 16, 48, 64], name + end_point,
        )
        if self._final_endpoint == end_point:
            return

        end_point = "Mixed_4c"
        self.end_points[end_point] = InceptionModule(
            192 + 208 + 48 + 64, [160, 112, 224, 24, 64, 64], name + end_point,
        )
        if self._final_endpoint == end_point:
            return

        end_point = "Mixed_4d"
        self.end_points[end_point] = InceptionModule(
            160 + 224 + 64 + 64, [128, 128, 256, 24, 64, 64], name + end_point,
        )
        if self._final_endpoint == end_point:
            return

        end_point = "Mixed_4e"
        self.end_points[end_point] = InceptionModule(
            128 + 256 + 64 + 64, [112, 144, 288, 32, 64, 64], name + end_point,
        )
        if self._final_endpoint == end_point:
            return

        end_point = "Mixed_4f"
        self.end_points[end_point] = InceptionModule(
            112 + 288 + 64 + 64, [256, 160, 320, 32, 128, 128], name + end_point,
        )
        if self._final_endpoint == end_point:
            return

        end_point = "MaxPool3d_5a_2x2"
        self.end_points[end_point] = MaxPool3dSamePadding(
            kernel_size=[2, 2, 2], stride=(2, 2, 2), padding=0
        )
        if self._final_endpoint == end_point:
            return

        end_point = "Mixed_5b"
        self.end_points[end_point] = InceptionModule(
            256 + 320 + 128 + 128, [256, 160, 320, 32, 128, 128], name + end_point,
        )
        if self._final_endpoint == end_point:
            return

        end_point = "Mixed_5c"
        self.end_points[end_point] = InceptionModule(
            256 + 320 + 128 + 128, [384, 192, 384, 48, 128, 128], name + end_point,
        )
        if self._final_endpoint == end_point:
            return

        end_point = "Logits"

        last_duration = int(ceil(num_in_frames / 8))  # 8
        last_size = 7  # int(ceil(sample_width / 32)) # this is for 224
        self.avgpool = AvgPool3d((last_duration, last_size, last_size), stride=1)

        self.dropout = Dropout(dropout_keep_prob)

        self.logits = Unit3D(
            in_channels=384 + 384 + 128 + 128,
            output_channels=self._num_classes,
            kernel_shape=[1, 1, 1],
            padding=0,
            activation_fn=None,
            use_batch_norm=False,
            use_bias=True,
            name="logits",
        )

        self.build()

    def replace_logits(self, num_classes):
        self._num_classes = num_classes
        self.logits = Unit3D(
            in_channels=384 + 384 + 128 + 128,
            output_channels=self._num_classes,
            kernel_shape=[1, 1, 1],
            padding=0,
            activation_fn=None,
            use_batch_norm=False,
            use_bias=True,
            name="logits",
        )

    def build(self):
        for k in self.end_points.keys():
            self.add_module(k, self.end_points[k])

    def forward(self, x):
        for end_point in self.VALID_ENDPOINTS:
            if end_point in self.end_points:
                x = self._modules[end_point](x)
        # [batch x featuredim x 1 x 1 x 1]
        embds = self.dropout(self.avgpool(x))

        # [batch x classes x 1 x 1 x 1]
        x = self.logits(embds)
        if self._spatiotemporal_squeeze:
            # [batch x classes]
            logits = x.squeeze(3).squeeze(3).squeeze(2)

        # logits [batch X classes]
        if self.include_embds:
            return {"logits": logits, "embds": embds}
        else:
            return {"logits": logits}
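A quick shape sanity check for the model as defined above (sketch; random weights on CPU, so the outputs themselves are meaningless):

    import torch

    from app.models import InceptionI3d

    model = InceptionI3d(num_classes=2000, num_in_frames=64, include_embds=True)
    model.eval()

    with torch.no_grad():
        out = model(torch.zeros(1, 3, 64, 224, 224))  # batch x RGB x frames x H x W

    print(out["logits"].shape)  # torch.Size([1, 2000])
    print(out["embds"].shape)   # torch.Size([1, 1024, 1, 1, 1]), pooled features

The 64-frame, 224x224 input matches Config.number_of_frames and the hard-coded average-pool window (ceil(64/8) x 7 x 7), which is sized for 224-pixel inputs.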
app/server/__init__.py
ADDED
@@ -0,0 +1,50 @@
from http.server import BaseHTTPRequestHandler, HTTPServer
from threading import Thread

from app.translator import Translator


class Server(BaseHTTPRequestHandler):

    def __init__(self, translator: Translator):

        self.translator = translator


    def __call__(self, *args, **kwargs):

        # The instance doubles as the handler "class": HTTPServer calls it once
        # per request, which runs BaseHTTPRequestHandler's real initialiser.
        super().__init__(*args, **kwargs)


    def do_GET(self):

        self.send_response(200)
        self.send_header('Access-Control-Allow-Origin', '*')
        self.end_headers()
        self.wfile.write(self.translator.result.encode('utf-8'))


class HTTPDaemon:

    def __init__(self, host: str, port: int, translator: Translator):

        self.host = host
        self.port = port
        self.httpd = HTTPServer((self.host, self.port), Server(translator))

        self.server_thread: Thread


    def __enter__(self):

        print(f"Serving HTTP on {self.host} port {self.port} (http://{self.host}:{self.port}/)..")
        self.server_thread = Thread(target=self.httpd.serve_forever)
        self.server_thread.start()


    def __exit__(self, *_):

        print("\nServer closing..")
        self.httpd.shutdown()
        self.httpd.server_close()
        self.server_thread.join()
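The web page polls GET / for the latest word; the same can be done from Python (sketch; assumes the daemon is running on the default port from init_server):

    from urllib.request import urlopen

    with urlopen("http://localhost:5000/") as response:
        print(response.read().decode("utf-8"))  # empty until a confident prediction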
app/translator/__init__.py
ADDED
@@ -0,0 +1,245 @@
import math
import pickle
from collections import deque
from os.path import exists
from subprocess import call, DEVNULL

import cv2 as cv
import numpy as np
import scipy.ndimage
import scipy.special  # import submodules explicitly; a bare `import scipy` does not expose them
import torch
from torch.nn import DataParallel, Module

from app.config import Config
from app.models import InceptionI3d


class Translator:

    def __init__(self, confidence: float):

        self.confidence = confidence
        self.model = self.load_model(Config.checkpoint_path, Config.number_of_classes, Config.number_of_frames)
        self.word_data = self.load_vocabulary(Config.vocabulary_path)
        self.result = ""


    def resize_generic(self, img, oheight, owidth, interp="bilinear", is_flow=False):
        """
        Args
            inp: numpy array: RGB image (H, W, 3) | video with 3*nframes (H, W, 3*nframes)
                 | single channel image (H, W, 1) | -- not supported: video with (nframes, 3, H, W)
        """

        # resized_image = cv.resize(image, (100, 50))
        ht = img.shape[0]
        chn = img.shape[2]

        if chn == 1:
            # NOTE: scipy.misc.imresize was removed in SciPy 1.3; this branch needs an older SciPy.
            resized_img = scipy.misc.imresize(
                img.squeeze(), [oheight, owidth], interp=interp, mode="F"
            ).reshape((oheight, owidth, chn))
        elif chn == 3:
            # resized_img = scipy.misc.imresize(img, [oheight, owidth], interp=interp) # mode='F' gives an error for 3 channels
            resized_img = cv.resize(img, (owidth, oheight))  # (width, height) order, inverted compared to scipy
        elif chn == 2:
            # assert(is_flow)
            resized_img = np.zeros((oheight, owidth, chn), dtype=img.dtype)
            for t in range(chn):
                # resized_img[:, :, t] = scipy.misc.imresize(img[:, :, t], [oheight, owidth], interp=interp)
                # resized_img[:, :, t] = scipy.misc.imresize(img[:, :, t], [oheight, owidth], interp=interp, mode='F')
                # resized_img[:, :, t] = np.array(Image.fromarray(img[:, :, t]).resize([oheight, owidth]))
                resized_img[:, :, t] = scipy.ndimage.interpolation.zoom(
                    img[:, :, t], [oheight, owidth]
                )
        else:
            in_chn = 3
            # Workaround, would be better to pass #frames
            if chn == 16:
                in_chn = 1
            if chn == 32:
                in_chn = 2
            nframes = int(chn / in_chn)
            img = img.reshape(img.shape[0], img.shape[1], in_chn, nframes)
            resized_img = np.zeros((oheight, owidth, in_chn, nframes), dtype=img.dtype)
            for t in range(nframes):
                frame = img[:, :, :, t]  # img[:, :, t*3:t*3+3]
                frame = cv.resize(frame, (owidth, oheight)).reshape(
                    oheight, owidth, in_chn
                )
                # frame = scipy.misc.imresize(frame, [oheight, owidth], interp=interp)
                resized_img[:, :, :, t] = frame
            resized_img = resized_img.reshape(
                resized_img.shape[0], resized_img.shape[1], chn
            )

        if is_flow:
            # print(oheight / ht)
            # print(owidth / wd)
            resized_img = resized_img * oheight / ht
        return resized_img


    def color_normalize(self, x, mean, std):
        """Normalize a tensor of images by subtracting (resp. dividing) by the mean (resp.
        std. deviation) statistics of a dataset in RGB space.
        """
        if x.dim() in {3, 4}:
            if x.size(0) == 1:
                x = x.repeat(3, 1, 1)
            assert x.size(0) == 3, "For single video format, expected RGB along first dim"
            for t, m, s in zip(x, mean, std):
                t.sub_(m)
                t.div_(s)
        elif x.dim() == 5:
            assert (
                x.shape[1] == 3
            ), "For batched video format, expected RGB along second dim"
            x[:, 0].sub_(mean[0]).div_(std[0])
            x[:, 1].sub_(mean[1]).div_(std[1])
            x[:, 2].sub_(mean[2]).div_(std[2])
        return x


    def to_torch(self, ndarray):

        if type(ndarray).__module__ == "numpy":
            return torch.from_numpy(ndarray)
        elif not torch.is_tensor(ndarray):
            raise ValueError(f"Cannot convert {type(ndarray)} to torch tensor")
        return ndarray


    def to_numpy(self, tensor):

        if torch.is_tensor(tensor):
            return tensor.cpu().numpy()
        elif type(tensor).__module__ != "numpy":
            raise ValueError(f"Cannot convert {type(tensor)} to numpy array")
        return tensor


    def im_to_numpy(self, img):

        img = self.to_numpy(img)
        img = np.transpose(img, (1, 2, 0))  # H*W*C

        return img


    def im_to_torch(self, img):

        img = np.transpose(img, (2, 0, 1))  # C*H*W
        img = self.to_torch(img).float()

        return img / 255 if img.max() > 1 else img


    def load_model(self, checkpoint_path: str, number_of_classes: int, number_of_frames: int) -> Module:

        model = DataParallel(InceptionI3d(
            number_of_classes,
            spatiotemporal_squeeze=True,
            final_endpoint='Logits',
            name="inception_i3d",
            in_channels=3,
            dropout_keep_prob=0.5,
            num_in_frames=number_of_frames
        )).cuda()

        if not exists(Config.checkpoint_path):
            call(f'cat app/checkpoints/* >> {Config.checkpoint_path}', shell=True, stdout=DEVNULL)

        checkpoint = torch.load(checkpoint_path)
        model.load_state_dict(checkpoint['state_dict'])
        model.eval()

        return model


    def load_vocabulary(self, vocabulary_path: str) -> dict:

        with open(vocabulary_path, 'rb') as file:
            return pickle.load(file)


    def prepare_input(self, video: deque, input_resolution: int=224, resize_resolution: int=256, mean: torch.Tensor=0.5*torch.ones(3), std: torch.Tensor=1.0*torch.ones(3)) -> torch.Tensor:

        video_tensor = torch.stack(
            [self.im_to_torch(frame[:, :, [2, 1, 0]]) for frame in video]
        ).permute(1, 0, 2, 3)

        iC, iF, _, _ = video_tensor.shape
        video_tensor_resized = np.zeros((iF, resize_resolution, resize_resolution, iC))
        for t in range(iF):
            tmp = video_tensor[:, t, :, :]
            tmp = self.resize_generic(
                self.im_to_numpy(tmp), resize_resolution, resize_resolution, interp="bilinear", is_flow=False
            )
            video_tensor_resized[t] = tmp
        video_tensor_resized = np.transpose(video_tensor_resized, (3, 0, 1, 2))
        # Center crop coords
        ulx = int((resize_resolution - input_resolution) / 2)
        uly = int((resize_resolution - input_resolution) / 2)
        # Center crop to input_resolution x input_resolution (224x224 by default)
        video_tensor_resized = video_tensor_resized[:, :, uly : uly + input_resolution, ulx : ulx + input_resolution]
        video_tensor_resized = self.to_torch(video_tensor_resized).float()
        assert video_tensor_resized.max() <= 1
        video_tensor_resized = self.color_normalize(video_tensor_resized, mean, std)
        return video_tensor_resized


    def sliding_windows(self, input_video: torch.Tensor, number_of_frames: int, stride: int) -> torch.Tensor:
        """
        Return sliding windows and corresponding (middle) timestamp
        """
        C, nFrames, H, W = input_video.shape
        # If needed, pad to the minimum clip length
        if nFrames < number_of_frames:
            rgb_ = torch.zeros(C, number_of_frames, H, W)
            rgb_[:, :nFrames] = input_video
            rgb_[:, nFrames:] = input_video[:, -1].unsqueeze(1)
            input_video = rgb_
            nFrames = input_video.shape[1]

        num_clips = math.ceil((nFrames - number_of_frames) / stride) + 1

        rgb_slided = torch.zeros(num_clips, 3, number_of_frames, H, W)
        # For each clip
        for j in range(num_clips):
            # Check if num_clips becomes 0
            stride_j = j * stride
            actual_clip_length = min(number_of_frames, nFrames - stride_j)
            t_beg = stride_j if actual_clip_length == number_of_frames else nFrames - number_of_frames
            rgb_slided[j] = input_video[:, t_beg : t_beg + number_of_frames, :, :]

        return rgb_slided


    def video_to_asl(self, video: deque):

        input_video = self.prepare_input(video)
        input_sliding_window = self.sliding_windows(input_video, Config.number_of_frames, Config.stride)

        num_clips = input_sliding_window.shape[0]
        # Group the clips into batches
        num_batches = math.ceil(num_clips / Config.batch_size)
        raw_scores = np.empty((0, Config.number_of_classes), dtype=float)
        for b in range(num_batches):
            inp = input_sliding_window[b * Config.batch_size : (b + 1) * Config.batch_size]
            # Forward pass
            out = self.model(inp)
            raw_scores = np.append(raw_scores, out["logits"].cpu().detach().numpy(), axis=0)
        prob_scores = scipy.special.softmax(raw_scores, axis=1)
        prob_sorted = np.sort(prob_scores, axis=1)[:, ::-1]
        pred_sorted = np.argsort(prob_scores, axis=1)[:, ::-1]

        word_topk = np.empty((Config.topk, num_clips), dtype=object)
        for k in range(Config.topk):
            for i, p in enumerate(pred_sorted[:, k]):
                word_topk[k, i] = self.word_data["words"][p]
        prob_topk = prob_sorted[:, :Config.topk].transpose()

        # print(np.array([word_topk, prob_topk]).transpose())
        self.result = "" if prob_topk[0, 0] <= self.confidence else word_topk[0, 0]
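Note how the defaults interact: the camera buffer in app/__init__.py holds exactly Config.number_of_frames (64) frames, so each video_to_asl call sees a single sliding window, and stride only matters for longer inputs. The window-count arithmetic from sliding_windows, worked through (sketch):

    import math

    def num_clips(n_frames: int, clip_len: int = 64, stride: int = 8) -> int:
        n_frames = max(n_frames, clip_len)  # short inputs are padded up to clip_len
        return math.ceil((n_frames - clip_len) / stride) + 1

    print(num_clips(64))   # 1 -- the live 64-frame camera buffer
    print(num_clips(128))  # 9 overlapping clips for a longer recording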
main.py
ADDED
@@ -0,0 +1,4 @@
from app import init_server

if __name__ == '__main__':
    init_server()
public/index.css
ADDED
@@ -0,0 +1,20 @@
span {
  line-height: 1.8;
  word-wrap: normal;
  display: inline-block;
  padding-right: 0.3em;
}

#result {
  position: fixed;
  left: 50%;
  top: 50%;
  transform: translate(-50%, -50%);
}

.word {
  text-align: center;
  opacity: 1;
  font-family: 'Roboto Mono', monospace;
  font-size: xx-large;
}
public/index.html
ADDED
@@ -0,0 +1,67 @@
<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="UTF-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1" />
    <title>Real-time ASL Translator</title>
    <link rel="preconnect" href="https://fonts.googleapis.com" />
    <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin />
    <link href="https://fonts.googleapis.com/css2?family=Roboto+Mono:wght@500&display=swap" rel="stylesheet" />
    <link rel="stylesheet" href="index.css" />
    <script src="https://cdnjs.cloudflare.com/ajax/libs/gsap/3.6.1/gsap.min.js"></script>
  </head>

  <body>
    <div id="result"></div>

    <script type="text/javascript" charset="utf-8">
      const result_element = document.getElementById('result')
      let previous_result = ''
      result_element.style.opacity = '1'

      const timestep = 1 / 60
      let opacity_progress = 1

      const ease_in_expo = (progress_value) =>
        progress_value == 0.0 ? 0.0 : Math.pow(2.0, 10.0 * progress_value - 10.0)

      // Fade the accumulated words out exponentially; clear once nearly invisible.
      setInterval(() => {
        opacity_progress -= 0.0025
        result_element.style.opacity = String(ease_in_expo(opacity_progress))
        if (Number(result_element.style.opacity) <= 0.05) {
          result_element.replaceChildren()
        }
      }, timestep * 1000)

      // Poll the local translator server four times per second; append new words.
      setInterval(async () => {
        const result = await fetch('http://localhost:5000/')

        if (result.status !== 200) return
        const result_text = await result.text()

        if (result_text === '') return
        if (result_text !== previous_result) {
          previous_result = result_text
          const word = document.createElement('span')
          word.className = 'word'
          word.innerHTML = result_text
          result_element.appendChild(word)
          opacity_progress = 1

          gsap.fromTo(
            word,
            {
              autoAlpha: 0,
              filter: 'blur(10px)',
              y: 40
            },
            {
              autoAlpha: 1,
              filter: 'blur(0px)',
              y: 0,
              ease: 'Expo.easeOut',
              duration: 0.5
            }
          )
        }
      }, 250)
    </script>
  </body>
</html>
requirements.txt
CHANGED
@@ -1,9 +1,14 @@
opencv >= 4.6.0
pympi-ling
intervaltree
zsvision
mergedeep
humanize
mock
tqdm
tensorboard
scipy
pillow
scikit-learn
frozendict
numpy
resources/demo.gif
ADDED
Git LFS Details