supArs committed on
Commit
3afff35
1 Parent(s): 7914004

Upload 19 files

.gitattributes CHANGED
@@ -33,3 +33,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ app/checkpoints/xaa filter=lfs diff=lfs merge=lfs -text
+ app/checkpoints/xab filter=lfs diff=lfs merge=lfs -text
+ app/checkpoints/xac filter=lfs diff=lfs merge=lfs -text
+ app/checkpoints/xad filter=lfs diff=lfs merge=lfs -text
+ app/checkpoints/xae filter=lfs diff=lfs merge=lfs -text
+ app/checkpoints/xaf filter=lfs diff=lfs merge=lfs -text
+ resources/demo.gif filter=lfs diff=lfs merge=lfs -text
.prettierrc ADDED
@@ -0,0 +1,11 @@
+ {
+   "printWidth": 120,
+   "tabWidth": 2,
+   "useTabs": false,
+   "semi": false,
+   "singleQuote": true,
+   "trailingComma": "none",
+   "bracketSpacing": true,
+   "bracketSameLine": true,
+   "arrowParens": "always"
+ }
app/__init__.py ADDED
@@ -0,0 +1,62 @@
+ from sys import argv
+ from time import sleep
+
+ import cv2 as cv
+
+ from app.camera import Camera
+ from app.server import HTTPDaemon
+ from app.translator import Translator
+
+
+ def camera_loop(camera: Camera, translator: Translator):
+
+     retry_count = 0
+
+     while not camera.is_capturing():
+         if retry_count > 3:
+             raise Exception("No camera frames found.")
+
+         retry_count += 1
+         sleep(1)
+
+     while True:
+         cv.imshow('Input', camera.buffer[-1])
+         translator.video_to_asl(camera.buffer)
+
+         if cv.waitKey(1) == 27:
+             break
+
+     cv.destroyAllWindows()
+
+
+ def main(translator: Translator):
+
+     with Camera(0, 64) as camera:
+         camera_loop(camera, translator)
+
+
+ def init_server():
+
+     host = 'localhost'
+     port = parse_args()
+     translator = Translator(confidence=0.7)
+
+     with HTTPDaemon(host, port, translator):
+         try:
+             main(translator)
+
+         except KeyboardInterrupt:
+             print("\nManual exit detected.")
+
+         finally:
+             print("Exiting..")
+
+
+ def parse_args() -> int:
+
+     try:
+         return 5000 if len(argv) < 2 else int(argv[1])
+
+     except ValueError:
+         print("\nPort must be an integer.\ne.g. python server.py 5000\n")
+         raise
app/camera/__init__.py ADDED
@@ -0,0 +1,58 @@
+ from collections import deque
+ from threading import Thread
+
+ from cv2 import VideoCapture
+ from typing_extensions import Self
+
+
+ class Camera:
+
+     def __init__(self, capture_id: int=0, buffer_size: int=1):
+
+         self.capture_id = capture_id
+         self.buffer_size = buffer_size
+         self.capture: VideoCapture
+         self.capture_thread: Thread
+         self.buffer = deque([], buffer_size)
+         self.stop_capture = False
+
+
+     def __enter__(self) -> Self:
+
+         self.capture = VideoCapture(self.capture_id)
+
+         if not self.capture.isOpened():
+             raise IOError("Unable to open device.")
+
+         self.capture_thread = Thread(target=self.start_capture)
+         self.capture_thread.start()
+
+         return self
+
+
+     def __exit__(self, *_):
+
+         self.stop_capture = True
+         self.capture_thread.join()
+         self.capture.release()
+
+
+     def start_capture(self):
+
+         while True:
+             if self.stop_capture:
+                 break
+
+             _, frame = self.capture.read()
+
+             if len(self.buffer) == self.buffer_size:
+                 self.buffer.popleft()
+
+             self.buffer.append(frame)
+
+
+     def is_capturing(self) -> bool:
+
+         print(f"Filling buffer: {len(self.buffer)}/{self.buffer_size}")
+         return len(self.buffer) == self.buffer_size
+
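For orientation, a minimal usage sketch of the Camera context manager above (not part of the commit); device index 0 and the 64-frame buffer mirror the values used in app/__init__.py:

from app.camera import Camera

with Camera(0, 64) as camera:           # frames accumulate on a background thread
    while not camera.is_capturing():    # True once the 64-slot deque is full
        pass
    latest_frame = camera.buffer[-1]    # newest BGR frame from VideoCapture.read()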
app/checkpoints/xaa ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:07c12b25f4ba3ce4e354282940ffa010b063206eb7c676a9a404c125a657de4d
+ size 20971520
app/checkpoints/xab ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bcdb8eba8a143d5e0cf7509f66a0a560262d8b3117015e6d9b9449e2cd865ff4
+ size 20971520
app/checkpoints/xac ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:57d719a7c1c3ce68484a3fa28557aba9b91e60ed6919a7b34ba217f37f4ac477
+ size 20971520
app/checkpoints/xad ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d63027be73a90be23789000844df7344dd4cd353195ee61a8a28cfd6b7af6565
+ size 20971520
app/checkpoints/xae ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5bf557f3e89ab18a875d7be9f94e0955f8c70117792260dcda6590e3c0462924
+ size 20971520
app/checkpoints/xaf ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:935619ea8ee2a43891643a6111479d35328a889b238fa13b760acd538d8c4f71
+ size 9996102
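The six xaa–xaf files above appear to be a single model checkpoint split into 20 MiB chunks (20971520 bytes each, plus a smaller final piece); load_model in app/translator/__init__.py concatenates app/checkpoints/* into Config.checkpoint_path before loading it. A rough Python equivalent of that reassembly step, shown only for illustration:

from pathlib import Path

# Illustrative equivalent of `cat app/checkpoints/* >> app/checkpoints/model.pth.tar`.
chunks = sorted(Path("app/checkpoints").glob("xa?"))
with open("app/checkpoints/model.pth.tar", "wb") as out:
    for chunk in chunks:
        out.write(chunk.read_bytes())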
app/config/__init__.py ADDED
@@ -0,0 +1,9 @@
+ class Config:
+
+     checkpoint_path = 'app/checkpoints/model.pth.tar'
+     vocabulary_path = "app/data/info.pkl"
+     number_of_classes = 2000
+     number_of_frames = 64
+     stride = 8
+     batch_size = 10
+     topk = 1
app/data/info.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3a38d2b934b7bb3923b4ffad96789414a346cc4f38af26943077dc1fb66dc068
+ size 2182947
app/models/__init__.py ADDED
@@ -0,0 +1,444 @@
+ from math import ceil
+
+ from torch import cat
+ from torch.nn import AvgPool3d, BatchNorm3d, Conv3d, Dropout, MaxPool3d, Module
+ from torch.nn.functional import pad, relu
+
+
+ class MaxPool3dSamePadding(MaxPool3d):
+     def compute_pad(self, dim, s):
+         if s % self.stride[dim] == 0:
+             return max(self.kernel_size[dim] - self.stride[dim], 0)
+         else:
+             return max(self.kernel_size[dim] - (s % self.stride[dim]), 0)
+
+     def forward(self, x):
+         # compute 'same' padding
+         _, _, t, h, w = x.size()
+         # print t,h,w
+         # out_t = np.ceil(float(t) / float(self.stride[0]))
+         # out_h = np.ceil(float(h) / float(self.stride[1]))
+         # out_w = np.ceil(float(w) / float(self.stride[2]))
+         # print out_t, out_h, out_w
+         pad_t = self.compute_pad(0, t)
+         pad_h = self.compute_pad(1, h)
+         pad_w = self.compute_pad(2, w)
+         # print pad_t, pad_h, pad_w
+
+         pad_t_f = pad_t // 2
+         pad_t_b = pad_t - pad_t_f
+         pad_h_f = pad_h // 2
+         pad_h_b = pad_h - pad_h_f
+         pad_w_f = pad_w // 2
+         pad_w_b = pad_w - pad_w_f
+
+         padding = (pad_w_f, pad_w_b, pad_h_f, pad_h_b, pad_t_f, pad_t_b)
+         x = pad(x, padding)
+         return super(MaxPool3dSamePadding, self).forward(x)
+
+
+ class Unit3D(Module):
+
+     def __init__(
+         self,
+         in_channels,
+         output_channels,
+         kernel_shape=(1, 1, 1),
+         stride=(1, 1, 1),
+         padding=0,
+         activation_fn=relu,
+         use_batch_norm=True,
+         use_bias=False,
+         name="unit_3d",
+         num_domains=1,
+     ):
+
+         """Initializes Unit3D module."""
+         super(Unit3D, self).__init__()
+
+         self._output_channels = output_channels
+         self._kernel_shape = kernel_shape
+         self._stride = stride
+         self._use_batch_norm = use_batch_norm
+         self._num_domains = num_domains
+         self._activation_fn = activation_fn
+         self._use_bias = use_bias
+         self.name = name
+         self.padding = padding
+
+         self.conv3d = Conv3d(
+             in_channels=in_channels,
+             out_channels=self._output_channels,
+             kernel_size=self._kernel_shape,
+             stride=self._stride,
+             padding=0,  # we always want padding to be 0 here. We will dynamically pad based on input size in forward function
+             bias=self._use_bias,
+         )
+
+         if self._use_batch_norm:
+             if self._num_domains == 1:
+                 self.bn = BatchNorm3d(self._output_channels, eps=0.001, momentum=0.01)
+
+
+     def compute_pad(self, dim, s):
+         if s % self._stride[dim] == 0:
+             return max(self._kernel_shape[dim] - self._stride[dim], 0)
+         else:
+             return max(self._kernel_shape[dim] - (s % self._stride[dim]), 0)
+
+
+     def forward(self, x):
+         # compute 'same' padding
+         _, _, t, h, w = x.size()
+         # print t,h,w
+         # out_t = np.ceil(float(t) / float(self._stride[0]))
+         # out_h = np.ceil(float(h) / float(self._stride[1]))
+         # out_w = np.ceil(float(w) / float(self._stride[2]))
+         # print out_t, out_h, out_w
+         pad_t = self.compute_pad(0, t)
+         pad_h = self.compute_pad(1, h)
+         pad_w = self.compute_pad(2, w)
+         # print pad_t, pad_h, pad_w
+
+         pad_t_f = pad_t // 2
+         pad_t_b = pad_t - pad_t_f
+         pad_h_f = pad_h // 2
+         pad_h_b = pad_h - pad_h_f
+         pad_w_f = pad_w // 2
+         pad_w_b = pad_w - pad_w_f
+
+         padding = (pad_w_f, pad_w_b, pad_h_f, pad_h_b, pad_t_f, pad_t_b)
+         # print x.size()
+         # print pad
+         x = pad(x, padding)
+         # print x.size()
+
+         x = self.conv3d(x)
+         if self._use_batch_norm:
+             x = self.bn(x)
+         if self._activation_fn is not None:
+             x = self._activation_fn(x)
+         return x
+
+
+ class InceptionModule(Module):
+     def __init__(self, in_channels, out_channels, name, num_domains=1):
+         super(InceptionModule, self).__init__()
+
+         self.b0 = Unit3D(
+             in_channels=in_channels,
+             output_channels=out_channels[0],
+             kernel_shape=[1, 1, 1],
+             padding=0,
+             name=name + "/Branch_0/Conv3d_0a_1x1",
+         )
+         self.b1a = Unit3D(
+             in_channels=in_channels,
+             output_channels=out_channels[1],
+             kernel_shape=[1, 1, 1],
+             padding=0,
+             name=name + "/Branch_1/Conv3d_0a_1x1",
+         )
+         self.b1b = Unit3D(
+             in_channels=out_channels[1],
+             output_channels=out_channels[2],
+             kernel_shape=[3, 3, 3],
+             name=name + "/Branch_1/Conv3d_0b_3x3",
+         )
+         self.b2a = Unit3D(
+             in_channels=in_channels,
+             output_channels=out_channels[3],
+             kernel_shape=[1, 1, 1],
+             padding=0,
+             name=name + "/Branch_2/Conv3d_0a_1x1",
+         )
+         self.b2b = Unit3D(
+             in_channels=out_channels[3],
+             output_channels=out_channels[4],
+             kernel_shape=[3, 3, 3],
+             name=name + "/Branch_2/Conv3d_0b_3x3",
+         )
+         self.b3a = MaxPool3dSamePadding(
+             kernel_size=[3, 3, 3], stride=(1, 1, 1), padding=0
+         )
+         self.b3b = Unit3D(
+             in_channels=in_channels,
+             output_channels=out_channels[5],
+             kernel_shape=[1, 1, 1],
+             padding=0,
+             name=name + "/Branch_3/Conv3d_0b_1x1",
+         )
+         self.name = name
+
+     def forward(self, x):
+         b0 = self.b0(x)
+         b1 = self.b1b(self.b1a(x))
+         b2 = self.b2b(self.b2a(x))
+         b3 = self.b3b(self.b3a(x))
+         return cat([b0, b1, b2, b3], dim=1)
+
+
+ class InceptionI3d(Module):
+     """Inception-v1 I3D architecture.
+     The model is introduced in:
+         Quo Vadis, Action Recognition? A New Model and the Kinetics Dataset
+         Joao Carreira, Andrew Zisserman
+         https://arxiv.org/pdf/1705.07750v1.pdf.
+     See also the Inception architecture, introduced in:
+         Going deeper with convolutions
+         Christian Szegedy, Wei Liu, Yangqing Jia, Pierre Sermanet, Scott Reed,
+         Dragomir Anguelov, Dumitru Erhan, Vincent Vanhoucke, Andrew Rabinovich.
+         http://arxiv.org/pdf/1409.4842v1.pdf.
+     """
+
+     # Endpoints of the model in order. During construction, all the endpoints up
+     # to a designated `final_endpoint` are returned in a dictionary as the
+     # second return value.
+     VALID_ENDPOINTS = (
+         "Conv3d_1a_7x7",
+         "MaxPool3d_2a_3x3",
+         "Conv3d_2b_1x1",
+         "Conv3d_2c_3x3",
+         "MaxPool3d_3a_3x3",
+         "Mixed_3b",
+         "Mixed_3c",
+         "MaxPool3d_4a_3x3",
+         "Mixed_4b",
+         "Mixed_4c",
+         "Mixed_4d",
+         "Mixed_4e",
+         "Mixed_4f",
+         "MaxPool3d_5a_2x2",
+         "Mixed_5b",
+         "Mixed_5c",
+         "Logits",
+         "Predictions",
+     )
+
+     def __init__(
+         self,
+         num_classes=400,
+         spatiotemporal_squeeze=True,
+         final_endpoint="Logits",
+         name="inception_i3d",
+         in_channels=3,
+         dropout_keep_prob=0.5,
+         num_in_frames=64,
+         include_embds=False,
+     ):
+         """Initializes I3D model instance.
+         Args:
+             num_classes: The number of outputs in the logit layer (default 400, which
+                 matches the Kinetics dataset).
+             spatiotemporal_squeeze: Whether to squeeze the 2 spatial and 1 temporal dimensions for the logits
+                 before returning (default True).
+             final_endpoint: The model contains many possible endpoints.
+                 `final_endpoint` specifies the last endpoint for the model to be built
+                 up to. In addition to the output at `final_endpoint`, all the outputs
+                 at endpoints up to `final_endpoint` will also be returned, in a
+                 dictionary. `final_endpoint` must be one of
+                 InceptionI3d.VALID_ENDPOINTS (default 'Logits').
+             in_channels: Number of input channels (default 3 for RGB).
+             dropout_keep_prob: Dropout probability (default 0.5).
+             name: A string (optional). The name of this module.
+             num_in_frames: Number of input frames (default 64).
+             include_embds: Whether to return embeddings (default False).
+         Raises:
+             ValueError: if `final_endpoint` is not recognized.
+         """
+
+         if final_endpoint not in self.VALID_ENDPOINTS:
+             raise ValueError("Unknown final endpoint %s" % final_endpoint)
+
+         super().__init__()
+         self._num_classes = num_classes
+         self._spatiotemporal_squeeze = spatiotemporal_squeeze
+         self._final_endpoint = final_endpoint
+         self.include_embds = include_embds
+         self.logits = None
+
+         if self._final_endpoint not in self.VALID_ENDPOINTS:
+             raise ValueError("Unknown final endpoint %s" % self._final_endpoint)
+
+         self.end_points = {}
+         end_point = "Conv3d_1a_7x7"
+         self.end_points[end_point] = Unit3D(
+             in_channels=in_channels,
+             output_channels=64,
+             kernel_shape=[7, 7, 7],
+             stride=(2, 2, 2),
+             padding=(3, 3, 3),
+             name=name + end_point,
+         )
+         if self._final_endpoint == end_point:
+             return
+
+         end_point = "MaxPool3d_2a_3x3"
+         self.end_points[end_point] = MaxPool3dSamePadding(
+             kernel_size=[1, 3, 3], stride=(1, 2, 2), padding=0
+         )
+         if self._final_endpoint == end_point:
+             return
+
+         end_point = "Conv3d_2b_1x1"
+         self.end_points[end_point] = Unit3D(
+             in_channels=64,
+             output_channels=64,
+             kernel_shape=[1, 1, 1],
+             padding=0,
+             name=name + end_point,
+         )
+         if self._final_endpoint == end_point:
+             return
+
+         end_point = "Conv3d_2c_3x3"
+         self.end_points[end_point] = Unit3D(
+             in_channels=64,
+             output_channels=192,
+             kernel_shape=[3, 3, 3],
+             padding=1,
+             name=name + end_point,
+         )
+         if self._final_endpoint == end_point:
+             return
+
+         end_point = "MaxPool3d_3a_3x3"
+         self.end_points[end_point] = MaxPool3dSamePadding(
+             kernel_size=[1, 3, 3], stride=(1, 2, 2), padding=0
+         )
+         if self._final_endpoint == end_point:
+             return
+
+         end_point = "Mixed_3b"
+         self.end_points[end_point] = InceptionModule(
+             192, [64, 96, 128, 16, 32, 32], name + end_point,
+         )
+         if self._final_endpoint == end_point:
+             return
+
+         end_point = "Mixed_3c"
+         self.end_points[end_point] = InceptionModule(
+             256, [128, 128, 192, 32, 96, 64], name + end_point,
+         )
+         if self._final_endpoint == end_point:
+             return
+
+         end_point = "MaxPool3d_4a_3x3"
+         self.end_points[end_point] = MaxPool3dSamePadding(
+             kernel_size=[3, 3, 3], stride=(2, 2, 2), padding=0
+         )
+         if self._final_endpoint == end_point:
+             return
+
+         end_point = "Mixed_4b"
+         self.end_points[end_point] = InceptionModule(
+             128 + 192 + 96 + 64, [192, 96, 208, 16, 48, 64], name + end_point,
+         )
+         if self._final_endpoint == end_point:
+             return
+
+         end_point = "Mixed_4c"
+         self.end_points[end_point] = InceptionModule(
+             192 + 208 + 48 + 64, [160, 112, 224, 24, 64, 64], name + end_point,
+         )
+         if self._final_endpoint == end_point:
+             return
+
+         end_point = "Mixed_4d"
+         self.end_points[end_point] = InceptionModule(
+             160 + 224 + 64 + 64, [128, 128, 256, 24, 64, 64], name + end_point,
+         )
+         if self._final_endpoint == end_point:
+             return
+
+         end_point = "Mixed_4e"
+         self.end_points[end_point] = InceptionModule(
+             128 + 256 + 64 + 64, [112, 144, 288, 32, 64, 64], name + end_point,
+         )
+         if self._final_endpoint == end_point:
+             return
+
+         end_point = "Mixed_4f"
+         self.end_points[end_point] = InceptionModule(
+             112 + 288 + 64 + 64, [256, 160, 320, 32, 128, 128], name + end_point,
+         )
+         if self._final_endpoint == end_point:
+             return
+
+         end_point = "MaxPool3d_5a_2x2"
+         self.end_points[end_point] = MaxPool3dSamePadding(
+             kernel_size=[2, 2, 2], stride=(2, 2, 2), padding=0
+         )
+         if self._final_endpoint == end_point:
+             return
+
+         end_point = "Mixed_5b"
+         self.end_points[end_point] = InceptionModule(
+             256 + 320 + 128 + 128, [256, 160, 320, 32, 128, 128], name + end_point,
+         )
+         if self._final_endpoint == end_point:
+             return
+
+         end_point = "Mixed_5c"
+         self.end_points[end_point] = InceptionModule(
+             256 + 320 + 128 + 128, [384, 192, 384, 48, 128, 128], name + end_point,
+         )
+         if self._final_endpoint == end_point:
+             return
+
+         end_point = "Logits"
+
+         last_duration = int(ceil(num_in_frames / 8))  # 8
+         last_size = 7  # int(ceil(sample_width / 32))  # this is for 224
+         self.avgpool = AvgPool3d((last_duration, last_size, last_size), stride=1)
+
+         self.dropout = Dropout(dropout_keep_prob)
+
+         self.logits = Unit3D(
+             in_channels=384 + 384 + 128 + 128,
+             output_channels=self._num_classes,
+             kernel_shape=[1, 1, 1],
+             padding=0,
+             activation_fn=None,
+             use_batch_norm=False,
+             use_bias=True,
+             name="logits",
+         )
+
+         self.build()
+
+     def replace_logits(self, num_classes):
+         self._num_classes = num_classes
+         self.logits = Unit3D(
+             in_channels=384 + 384 + 128 + 128,
+             output_channels=self._num_classes,
+             kernel_shape=[1, 1, 1],
+             padding=0,
+             activation_fn=None,
+             use_batch_norm=False,
+             use_bias=True,
+             name="logits",
+         )
+
+     def build(self):
+         for k in self.end_points.keys():
+             self.add_module(k, self.end_points[k])
+
+     def forward(self, x):
+         for end_point in self.VALID_ENDPOINTS:
+             if end_point in self.end_points:
+                 x = self._modules[end_point](x)
+         # [batch x featuredim x 1 x 1 x 1]
+         embds = self.dropout(self.avgpool(x))
+
+         # [batch x classes x 1 x 1 x 1]
+         x = self.logits(embds)
+         if self._spatiotemporal_squeeze:
+             # [batch x classes]
+             logits = x.squeeze(3).squeeze(3).squeeze(2)
+
+         # logits [batch X classes]
+         if self.include_embds:
+             return {"logits": logits, "embds": embds}
+         else:
+             return {"logits": logits}
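A hedged smoke-test sketch for the InceptionI3d module above, using the values the Translator passes in (2000 classes, 64 input frames) and an assumed 224x224 RGB clip, which is the spatial size the avgpool in the constructor expects:

import torch

model = InceptionI3d(num_classes=2000, spatiotemporal_squeeze=True,
                     final_endpoint="Logits", in_channels=3, num_in_frames=64)
model.eval()

with torch.no_grad():
    clip = torch.zeros(1, 3, 64, 224, 224)  # batch x channels x frames x height x width
    out = model(clip)

print(out["logits"].shape)  # expected: torch.Size([1, 2000])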
app/server/__init__.py ADDED
@@ -0,0 +1,50 @@
+ from http.server import BaseHTTPRequestHandler, HTTPServer
+ from threading import Thread
+
+ from app.translator import Translator
+
+
+ class Server(BaseHTTPRequestHandler):
+
+     def __init__(self, translator: Translator):
+
+         self.translator = translator
+
+
+     def __call__(self, *args, **kwargs):
+
+         super().__init__(*args, **kwargs)
+
+
+     def do_GET(self):
+
+         self.send_response(200)
+         self.send_header('Access-Control-Allow-Origin', '*')
+         self.end_headers()
+         self.wfile.write(self.translator.result.encode('utf-8'))
+
+
+ class HTTPDaemon:
+
+     def __init__(self, host: str, port: int, translator: Translator):
+
+         self.host = host
+         self.port = port
+         self.httpd = HTTPServer((self.host, self.port), Server(translator))
+
+         self.server_thread: Thread
+
+
+     def __enter__(self):
+
+         print(f"Serving HTTP on {self.host} port {self.port} (http://{self.host}:{self.port}/)..")
+         self.server_thread = Thread(target=self.httpd.serve_forever)
+         self.server_thread.start()
+
+
+     def __exit__(self, *_):
+
+         print("\nServer closing..")
+         self.httpd.shutdown()
+         self.httpd.server_close()
+         self.server_thread.join()
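A minimal client sketch (assuming the daemon is listening on localhost:5000, the address public/index.html polls): every GET / returns the latest translated word as plain UTF-8 text, or an empty body when nothing has cleared the confidence threshold yet.

from urllib.request import urlopen

word = urlopen("http://localhost:5000/").read().decode("utf-8")
print(word or "(no confident prediction yet)")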
app/translator/__init__.py ADDED
@@ -0,0 +1,245 @@
+ import math
+ import pickle
+ from collections import deque
+ from os.path import exists
+ from subprocess import call, DEVNULL
+
+ import cv2 as cv
+ import numpy as np
+ import scipy
+ import torch
+ from torch.nn import DataParallel, Module
+
+ from app.config import Config
+ from app.models import InceptionI3d
+
+
+ class Translator:
+
+     def __init__(self, confidence: float):
+
+         self.confidence = confidence
+         self.model = self.load_model(Config.checkpoint_path, Config.number_of_classes, Config.number_of_frames)
+         self.word_data = self.load_vocabulary(Config.vocabulary_path)
+         self.result = ""
+
+
+     def resize_generic(self, img, oheight, owidth, interp="bilinear", is_flow=False):
+         """
+         Args
+             inp: numpy array: RGB image (H, W, 3) | video with 3*nframes (H, W, 3*nframes)
+                  | single channel image (H, W, 1) | -- not supported: video with (nframes, 3, H, W)
+         """
+
+         # resized_image = cv.resize(image, (100, 50))
+         ht = img.shape[0]
+         chn = img.shape[2]
+
+         if chn == 1:
+             resized_img = scipy.misc.imresize(
+                 img.squeeze(), [oheight, owidth], interp=interp, mode="F"
+             ).reshape((oheight, owidth, chn))
+         elif chn == 3:
+             # resized_img = scipy.misc.imresize(img, [oheight, owidth], interp=interp)  # mode='F' gives an error for 3 channels
+             resized_img = cv.resize(img, (owidth, oheight))  # inverted compared to scipy
+         elif chn == 2:
+             # assert(is_flow)
+             resized_img = np.zeros((oheight, owidth, chn), dtype=img.dtype)
+             for t in range(chn):
+                 # resized_img[:, :, t] = scipy.misc.imresize(img[:, :, t], [oheight, owidth], interp=interp)
+                 # resized_img[:, :, t] = scipy.misc.imresize(img[:, :, t], [oheight, owidth], interp=interp, mode='F')
+                 # resized_img[:, :, t] = np.array(Image.fromarray(img[:, :, t]).resize([oheight, owidth]))
+                 resized_img[:, :, t] = scipy.ndimage.interpolation.zoom(
+                     img[:, :, t], [oheight, owidth]
+                 )
+         else:
+             in_chn = 3
+             # Workaround, would be better to pass #frames
+             if chn == 16:
+                 in_chn = 1
+             if chn == 32:
+                 in_chn = 2
+             nframes = int(chn / in_chn)
+             img = img.reshape(img.shape[0], img.shape[1], in_chn, nframes)
+             resized_img = np.zeros((oheight, owidth, in_chn, nframes), dtype=img.dtype)
+             for t in range(nframes):
+                 frame = img[:, :, :, t]  # img[:, :, t*3:t*3+3]
+                 frame = cv.resize(frame, (owidth, oheight)).reshape(
+                     oheight, owidth, in_chn
+                 )
+                 # frame = scipy.misc.imresize(frame, [oheight, owidth], interp=interp)
+                 resized_img[:, :, :, t] = frame
+             resized_img = resized_img.reshape(
+                 resized_img.shape[0], resized_img.shape[1], chn
+             )
+
+         if is_flow:
+             # print(oheight / ht)
+             # print(owidth / wd)
+             resized_img = resized_img * oheight / ht
+         return resized_img
+
+
+     def color_normalize(self, x, mean, std):
+         """Normalize a tensor of images by subtracting (resp. dividing) by the mean (resp.
+         std. deviation) statistics of a dataset in RGB space.
+         """
+         if x.dim() in {3, 4}:
+             if x.size(0) == 1:
+                 x = x.repeat(3, 1, 1)
+             assert x.size(0) == 3, "For single video format, expected RGB along first dim"
+             for t, m, s in zip(x, mean, std):
+                 t.sub_(m)
+                 t.div_(s)
+         elif x.dim() == 5:
+             assert (
+                 x.shape[1] == 3
+             ), "For batched video format, expected RGB along second dim"
+             x[:, 0].sub_(mean[0]).div_(std[0])
+             x[:, 1].sub_(mean[1]).div_(std[1])
+             x[:, 2].sub_(mean[2]).div_(std[2])
+         return x
+
+
+     def to_torch(self, ndarray):
+
+         if type(ndarray).__module__ == "numpy":
+             return torch.from_numpy(ndarray)
+         elif not torch.is_tensor(ndarray):
+             raise ValueError(f"Cannot convert {type(ndarray)} to torch tensor")
+         return ndarray
+
+
+     def to_numpy(self, tensor):
+
+         if torch.is_tensor(tensor):
+             return tensor.cpu().numpy()
+         elif type(tensor).__module__ != "numpy":
+             raise ValueError(f"Cannot convert {type(tensor)} to numpy array")
+         return tensor
+
+
+     def im_to_numpy(self, img):
+
+         img = self.to_numpy(img)
+         img = np.transpose(img, (1, 2, 0))  # H*W*C
+
+         return img
+
+
+     def im_to_torch(self, img):
+
+         img = np.transpose(img, (2, 0, 1))  # C*H*W
+         img = self.to_torch(img).float()
+
+         return img / 255 if img.max() > 1 else img
+
+
+     def load_model(self, checkpoint_path: str, number_of_classes: int, number_of_frames: int) -> Module:
+
+         model = DataParallel(InceptionI3d(
+             number_of_classes,
+             spatiotemporal_squeeze=True,
+             final_endpoint='Logits',
+             name="inception_i3d",
+             in_channels=3,
+             dropout_keep_prob=0.5,
+             num_in_frames=number_of_frames
+         )).cuda()
+
+         if not exists(Config.checkpoint_path):
+             call(f'cat app/checkpoints/* >> {Config.checkpoint_path}', shell=True, stdout=DEVNULL)
+
+         checkpoint = torch.load(checkpoint_path)
+         model.load_state_dict(checkpoint['state_dict'])
+         model.eval()
+
+         return model
+
+
+     def load_vocabulary(self, vocabulary_path: str) -> dict:
+
+         with open(vocabulary_path, 'rb') as file:
+             return pickle.load(file)
+
+
+     def prepare_input(self, video: deque, input_resolution: int=224, resize_resolution: int=256, mean: torch.Tensor=0.5*torch.ones(3), std: torch.Tensor=1.0*torch.ones(3)) -> np.ndarray:
+
+         video_tensor = torch.stack(
+             [self.im_to_torch(frame[:, :, [2, 1, 0]]) for frame in video]
+         ).permute(1, 0, 2, 3)
+
+         iC, iF, _, _ = video_tensor.shape
+         video_tensor_resized = np.zeros((iF, resize_resolution, resize_resolution, iC))
+         for t in range(iF):
+             tmp = video_tensor[:, t, :, :]
+             tmp = self.resize_generic(
+                 self.im_to_numpy(tmp), resize_resolution, resize_resolution, interp="bilinear", is_flow=False
+             )
+             video_tensor_resized[t] = tmp
+         video_tensor_resized = np.transpose(video_tensor_resized, (3, 0, 1, 2))
+         # Center crop coords
+         ulx = int((resize_resolution - input_resolution) / 2)
+         uly = int((resize_resolution - input_resolution) / 2)
+         # Crop 256x256
+         video_tensor_resized = video_tensor_resized[:, :, uly : uly + input_resolution, ulx : ulx + input_resolution]
+         video_tensor_resized = self.to_torch(video_tensor_resized).float()
+         assert video_tensor_resized.max() <= 1
+         video_tensor_resized = self.color_normalize(video_tensor_resized, mean, std)
+         return video_tensor_resized
+
+
+     def sliding_windows(self, input_video: torch.Tensor, number_of_frames: int, stride: int) -> torch.Tensor:
+
+         """
+         Return sliding windows and corresponding (middle) timestamp
+         """
+         C, nFrames, H, W = input_video.shape
+         # If needed, pad to the minimum clip length
+         if nFrames < number_of_frames:
+             rgb_ = torch.zeros(C, number_of_frames, H, W)
+             rgb_[:, :nFrames] = input_video
+             rgb_[:, nFrames:] = input_video[:, -1].unsqueeze(1)
+             input_video = rgb_
+             nFrames = input_video.shape[1]
+
+         num_clips = math.ceil((nFrames - number_of_frames) / stride) + 1
+
+         rgb_slided = torch.zeros(num_clips, 3, number_of_frames, H, W)
+         # For each clip
+         for j in range(num_clips):
+             # Check if num_clips becomes 0
+             stride_j = j * stride
+             actual_clip_length = min(number_of_frames, nFrames - stride_j)
+             t_beg = stride_j if actual_clip_length == number_of_frames else nFrames - number_of_frames
+             rgb_slided[j] = input_video[:, t_beg : t_beg + number_of_frames, :, :]
+
+         return rgb_slided
+
+
+     def video_to_asl(self, video: deque):
+
+         input_video = self.prepare_input(video)
+         input_sliding_window = self.sliding_windows(input_video, Config.number_of_frames, Config.stride)
+
+         num_clips = input_sliding_window.shape[0]
+         # Group the clips into batches
+         num_batches = math.ceil(num_clips / Config.batch_size)
+         raw_scores = np.empty((0, Config.number_of_classes), dtype=float)
+         for b in range(num_batches):
+             inp = input_sliding_window[b * Config.batch_size : (b + 1) * Config.batch_size]
+             # Forward pass
+             out = self.model(inp)
+             raw_scores = np.append(raw_scores, out["logits"].cpu().detach().numpy(), axis=0)
+         prob_scores = scipy.special.softmax(raw_scores, axis=1)
+         prob_sorted = np.sort(prob_scores, axis=1)[:, ::-1]
+         pred_sorted = np.argsort(prob_scores, axis=1)[:, ::-1]
+
+         word_topk = np.empty((Config.topk, num_clips), dtype=object)
+         for k in range(Config.topk):
+             for i, p in enumerate(pred_sorted[:, k]):
+                 word_topk[k, i] = self.word_data["words"][p]
+         prob_topk = prob_sorted[:, :Config.topk].transpose()
+
+         # print(np.array([word_topk, prob_topk]).transpose())
+         self.result = "" if prob_topk[0, 0] <= self.confidence else word_topk[0, 0]
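Worked through with the Config values above: a full 64-frame camera buffer yields exactly one sliding-window clip per call to video_to_asl, so each call amounts to a single batched forward pass. A quick check of the arithmetic in sliding_windows:

import math

number_of_frames, stride, nFrames = 64, 8, 64   # Config values plus a full Camera buffer
num_clips = math.ceil((nFrames - number_of_frames) / stride) + 1
print(num_clips)  # 1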
main.py ADDED
@@ -0,0 +1,4 @@
+ from app import init_server
+
+ if __name__ == '__main__':
+     init_server()
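Judging by parse_args in app/__init__.py, the entry point is presumably launched as python main.py (port defaults to 5000) or, say, python main.py 8080 to pick another port; note that public/index.html polls http://localhost:5000/ regardless, so the default port is the one the overlay expects.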
public/index.css ADDED
@@ -0,0 +1,20 @@
+ span {
+   line-height: 1.8;
+   word-wrap: normal;
+   display: inline-block;
+   padding-right: 0.3em;
+ }
+
+ #result {
+   position: fixed;
+   left: 50%;
+   top: 50%;
+   transform: translate(-50%, -50%);
+ }
+
+ .word {
+   text-align: center;
+   opacity: 1;
+   font-family: 'Roboto Mono', monospace;
+   font-size: xx-large;
+ }
public/index.html ADDED
@@ -0,0 +1,69 @@
+ <!DOCTYPE html>
+ <html lang="en">
+   <head>
+     <meta charset="UTF-8" />
+     <meta name="viewport" content="width=device-width, initial-scale=1" />
+     <title>Real-time ASL Translator</title>
+     <link rel="preconnect" href="https://fonts.googleapis.com" />
+     <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin />
+     <link href="https://fonts.googleapis.com/css2?family=Roboto+Mono:wght@500&display=swap" rel="stylesheet" />
+     <link rel="stylesheet" href="index.css" />
+     <script src="https://cdnjs.cloudflare.com/ajax/libs/gsap/3.6.1/gsap.min.js"></script>
+   </head>
+   <body>
+     <div id="result"></div>
+
+     <script type="text/javascript" charset="utf-8">
+       const result_element = document.getElementById('result')
+       let previous_result = ''
+       result_element.style.opacity = '1'
+
+       const timestep = 1 / 60
+       let opacity_progress = 1
+
+       const ease_in_expo = (progress_value) =>
+         progress_value == 0.0 ? 0.0 : Math.pow(2.0, 10.0 * progress_value - 10.0)
+
+       setInterval(() => {
+         opacity_progress -= 0.0025
+         result_element.style.opacity = String(ease_in_expo(opacity_progress))
+         if (Number(result_element.style.opacity) <= 0.05) {
+           result_element.replaceChildren()
+         }
+       }, timestep * 1000)
+
+       setInterval(async () => {
+         const result = await fetch('http://localhost:5000/')
+
+         if (result.status !== 200) return
+         const result_text = await result.text()
+
+         if (result_text === '') return
+         if (result_text !== previous_result) {
+           previous_result = result_text
+           const word = document.createElement('span')
+           word.className = 'word'
+           word.innerHTML = result_text
+           result_element.appendChild(word)
+           opacity_progress = 1
+
+           gsap.fromTo(
+             word,
+             {
+               autoAlpha: 0,
+               filter: 'blur(10px)',
+               y: 40
+             },
+             {
+               autoAlpha: 1,
+               filter: 'blur(0px)',
+               y: 0,
+               ease: 'Expo.easeOut',
+               duration: 0.5
+             }
+           )
+         }
+       }, 250)
+     </script>
+   </body>
+ </html>
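For reference on the fade behaviour: opacity_progress is reduced by 0.0025 roughly 60 times per second, and ease_in_expo(p) = 2^(10p - 10) falls below the 0.05 clear threshold once p drops under about 0.57, so the displayed words are wiped roughly three seconds after the last new word resets opacity_progress to 1.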
requirements.txt CHANGED
@@ -1,9 +1,14 @@
- flask==2.2.2
- Werkzeug==2.2.2
- protobuf==3.20.*
- flask-cors==3.0.10
- matplotlib==3.5.2
- numpy==1.22.3
- opencv-python-headless==4.5.5.64
- tensorflow==2.8.0
- uvicorn
+ opencv >= 4.6.0
+ pympi-ling
+ intervaltree
+ zsvision
+ mergedeep
+ humanize
+ mock
+ tqdm
+ tensorboard
+ scipy
+ pillow
+ scikit-learn
+ frozendict
+ numpy
resources/demo.gif ADDED

Git LFS Details

  • SHA256: fa85c217b43a58aac06841df3e7dc8afc479c5a82a6807c6b7c3b44377ad3630
  • Pointer size: 132 Bytes
  • Size of remote file: 1.7 MB