Upload 19 files
- .gitattributes +7 -0
- .prettierrc +11 -0
- app/__init__.py +62 -0
- app/camera/__init__.py +58 -0
- app/checkpoints/xaa +3 -0
- app/checkpoints/xab +3 -0
- app/checkpoints/xac +3 -0
- app/checkpoints/xad +3 -0
- app/checkpoints/xae +3 -0
- app/checkpoints/xaf +3 -0
- app/config/__init__.py +9 -0
- app/data/info.pkl +3 -0
- app/models/__init__.py +444 -0
- app/server/__init__.py +50 -0
- app/translator/__init__.py +245 -0
- main.py +4 -0
- public/index.css +20 -0
- public/index.html +67 -0
- requirements.txt +14 -9
- resources/demo.gif +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,10 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+app/checkpoints/xaa filter=lfs diff=lfs merge=lfs -text
+app/checkpoints/xab filter=lfs diff=lfs merge=lfs -text
+app/checkpoints/xac filter=lfs diff=lfs merge=lfs -text
+app/checkpoints/xad filter=lfs diff=lfs merge=lfs -text
+app/checkpoints/xae filter=lfs diff=lfs merge=lfs -text
+app/checkpoints/xaf filter=lfs diff=lfs merge=lfs -text
+resources/demo.gif filter=lfs diff=lfs merge=lfs -text
.prettierrc
ADDED
@@ -0,0 +1,11 @@
{
  "printWidth": 120,
  "tabWidth": 2,
  "useTabs": false,
  "semi": false,
  "singleQuote": true,
  "trailingComma": "none",
  "bracketSpacing": true,
  "bracketSameLine": true,
  "arrowParens": "always"
}
app/__init__.py
ADDED
@@ -0,0 +1,62 @@
from sys import argv
from time import sleep

import cv2 as cv

from app.camera import Camera
from app.server import HTTPDaemon
from app.translator import Translator


def camera_loop(camera: Camera, translator: Translator):

    retry_count = 0

    while not camera.is_capturing():
        if retry_count > 3:
            raise Exception("No camera frames found.")

        retry_count += 1
        sleep(1)

    while True:
        cv.imshow('Input', camera.buffer[-1])
        translator.video_to_asl(camera.buffer)

        if cv.waitKey(1) == 27:  # Esc
            break

    cv.destroyAllWindows()


def main(translator: Translator):

    with Camera(0, 64) as camera:
        camera_loop(camera, translator)


def init_server():

    host = 'localhost'
    port = parse_args()
    translator = Translator(confidence=0.7)

    with HTTPDaemon(host, port, translator):
        try:
            main(translator)

        except KeyboardInterrupt:
            print("\nManual exit detected.")

        finally:
            print("Exiting..")


def parse_args() -> int:

    try:
        return 5000 if len(argv) < 2 else int(argv[1])

    except ValueError:
        print("\nPort must be an integer.\ne.g. python main.py 5000\n")
        raise
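The entry point above wires everything together: init_server reads an optional port from argv, starts the HTTP daemon on a background thread, and runs the OpenCV capture/translate loop in the foreground until Esc or Ctrl-C. A launch sketch (hypothetical session; assumes the requirements, a CUDA device for the model, and a webcam are available):

    # Equivalent to `python main.py 8000`; parse_args() falls back to port 5000.
    import sys

    sys.argv = ["main.py", "8000"]  # parse_args() reads the port from argv[1]

    from app import init_server

    init_server()  # serves http://localhost:8000/ and opens the capture window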
app/camera/__init__.py
ADDED
@@ -0,0 +1,58 @@
from collections import deque
from threading import Thread

from cv2 import VideoCapture
from typing_extensions import Self


class Camera:

    def __init__(self, capture_id: int=0, buffer_size: int=1):

        self.capture_id = capture_id
        self.buffer_size = buffer_size
        self.capture: VideoCapture
        self.capture_thread: Thread
        self.buffer = deque([], buffer_size)
        self.stop_capture = False


    def __enter__(self) -> Self:

        self.capture = VideoCapture(self.capture_id)

        if not self.capture.isOpened():
            raise IOError("Unable to open device.")

        self.capture_thread = Thread(target=self.start_capture)
        self.capture_thread.start()

        return self


    def __exit__(self, *_):

        self.stop_capture = True
        self.capture_thread.join()
        self.capture.release()


    def start_capture(self):

        while True:
            if self.stop_capture:
                break

            _, frame = self.capture.read()

            if len(self.buffer) == self.buffer_size:
                self.buffer.popleft()  # redundant with the deque's maxlen, but explicit

            self.buffer.append(frame)


    def is_capturing(self) -> bool:

        print(f"Filling buffer: {len(self.buffer)}/{self.buffer_size}")
        return len(self.buffer) == self.buffer_size
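A minimal usage sketch for Camera (hypothetical; assumes a webcam at device index 0 and the requirements installed):

    from time import sleep

    from app.camera import Camera

    with Camera(0, buffer_size=16) as cam:
        while not cam.is_capturing():  # prints fill progress until the deque is full
            sleep(0.5)
        latest_frame = cam.buffer[-1]  # newest BGR frame, a numpy array from cv2
        print(latest_frame.shape)      # e.g. (480, 640, 3)

The capture thread runs for the lifetime of the with block; __exit__ sets the stop flag, joins the thread, and releases the device.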
app/checkpoints/xaa
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:07c12b25f4ba3ce4e354282940ffa010b063206eb7c676a9a404c125a657de4d
size 20971520
app/checkpoints/xab
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bcdb8eba8a143d5e0cf7509f66a0a560262d8b3117015e6d9b9449e2cd865ff4
size 20971520
app/checkpoints/xac
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:57d719a7c1c3ce68484a3fa28557aba9b91e60ed6919a7b34ba217f37f4ac477
size 20971520
app/checkpoints/xad
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d63027be73a90be23789000844df7344dd4cd353195ee61a8a28cfd6b7af6565
size 20971520
app/checkpoints/xae
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5bf557f3e89ab18a875d7be9f94e0955f8c70117792260dcda6590e3c0462924
size 20971520
app/checkpoints/xaf
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:935619ea8ee2a43891643a6111479d35328a889b238fa13b760acd538d8c4f71
size 9996102
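The six xa* blobs look like the output of `split` on a single checkpoint file (five exact 20 MiB chunks plus a smaller tail), presumably to stay under a per-file upload limit. Translator.load_model reassembles them with a shell `cat app/checkpoints/* >> app/checkpoints/model.pth.tar` on first run; a pure-Python equivalent as a sketch (the assemble_checkpoint helper is hypothetical, not part of this commit):

    from glob import glob
    from os.path import exists


    def assemble_checkpoint(out_path: str = "app/checkpoints/model.pth.tar") -> None:
        if exists(out_path):
            return
        with open(out_path, "wb") as out:
            # sorted() restores split's chunk order: xaa, xab, ..., xaf
            for chunk_path in sorted(glob("app/checkpoints/xa?")):
                with open(chunk_path, "rb") as chunk:
                    out.write(chunk.read())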
app/config/__init__.py
ADDED
@@ -0,0 +1,9 @@
class Config:

    checkpoint_path = 'app/checkpoints/model.pth.tar'
    vocabulary_path = "app/data/info.pkl"
    number_of_classes = 2000
    number_of_frames = 64
    stride = 8
    batch_size = 10
    topk = 1
app/data/info.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3a38d2b934b7bb3923b4ffad96789414a346cc4f38af26943077dc1fb66dc068
size 2182947
app/models/__init__.py
ADDED
@@ -0,0 +1,444 @@
from math import ceil

from torch import cat
from torch.nn import AvgPool3d, BatchNorm3d, Conv3d, Dropout, MaxPool3d, Module
from torch.nn.functional import pad, relu


class MaxPool3dSamePadding(MaxPool3d):
    def compute_pad(self, dim, s):
        if s % self.stride[dim] == 0:
            return max(self.kernel_size[dim] - self.stride[dim], 0)
        else:
            return max(self.kernel_size[dim] - (s % self.stride[dim]), 0)

    def forward(self, x):
        # compute 'same' padding
        _, _, t, h, w = x.size()
        # print t,h,w
        # out_t = np.ceil(float(t) / float(self.stride[0]))
        # out_h = np.ceil(float(h) / float(self.stride[1]))
        # out_w = np.ceil(float(w) / float(self.stride[2]))
        # print out_t, out_h, out_w
        pad_t = self.compute_pad(0, t)
        pad_h = self.compute_pad(1, h)
        pad_w = self.compute_pad(2, w)
        # print pad_t, pad_h, pad_w

        pad_t_f = pad_t // 2
        pad_t_b = pad_t - pad_t_f
        pad_h_f = pad_h // 2
        pad_h_b = pad_h - pad_h_f
        pad_w_f = pad_w // 2
        pad_w_b = pad_w - pad_w_f

        padding = (pad_w_f, pad_w_b, pad_h_f, pad_h_b, pad_t_f, pad_t_b)
        x = pad(x, padding)
        return super(MaxPool3dSamePadding, self).forward(x)


class Unit3D(Module):

    def __init__(
        self,
        in_channels,
        output_channels,
        kernel_shape=(1, 1, 1),
        stride=(1, 1, 1),
        padding=0,
        activation_fn=relu,
        use_batch_norm=True,
        use_bias=False,
        name="unit_3d",
        num_domains=1,
    ):
        """Initializes Unit3D module."""
        super(Unit3D, self).__init__()

        self._output_channels = output_channels
        self._kernel_shape = kernel_shape
        self._stride = stride
        self._use_batch_norm = use_batch_norm
        self._num_domains = num_domains
        self._activation_fn = activation_fn
        self._use_bias = use_bias
        self.name = name
        self.padding = padding

        self.conv3d = Conv3d(
            in_channels=in_channels,
            out_channels=self._output_channels,
            kernel_size=self._kernel_shape,
            stride=self._stride,
            padding=0,  # we always want padding to be 0 here. We will dynamically pad based on input size in forward function
            bias=self._use_bias,
        )

        if self._use_batch_norm:
            if self._num_domains == 1:
                self.bn = BatchNorm3d(self._output_channels, eps=0.001, momentum=0.01)

    def compute_pad(self, dim, s):
        if s % self._stride[dim] == 0:
            return max(self._kernel_shape[dim] - self._stride[dim], 0)
        else:
            return max(self._kernel_shape[dim] - (s % self._stride[dim]), 0)

    def forward(self, x):
        # compute 'same' padding
        _, _, t, h, w = x.size()
        # print t,h,w
        # out_t = np.ceil(float(t) / float(self._stride[0]))
        # out_h = np.ceil(float(h) / float(self._stride[1]))
        # out_w = np.ceil(float(w) / float(self._stride[2]))
        # print out_t, out_h, out_w
        pad_t = self.compute_pad(0, t)
        pad_h = self.compute_pad(1, h)
        pad_w = self.compute_pad(2, w)
        # print pad_t, pad_h, pad_w

        pad_t_f = pad_t // 2
        pad_t_b = pad_t - pad_t_f
        pad_h_f = pad_h // 2
        pad_h_b = pad_h - pad_h_f
        pad_w_f = pad_w // 2
        pad_w_b = pad_w - pad_w_f

        padding = (pad_w_f, pad_w_b, pad_h_f, pad_h_b, pad_t_f, pad_t_b)
        # print x.size()
        # print pad
        x = pad(x, padding)
        # print x.size()

        x = self.conv3d(x)
        if self._use_batch_norm:
            x = self.bn(x)
        if self._activation_fn is not None:
            x = self._activation_fn(x)
        return x


class InceptionModule(Module):
    def __init__(self, in_channels, out_channels, name, num_domains=1):
        super(InceptionModule, self).__init__()

        self.b0 = Unit3D(
            in_channels=in_channels,
            output_channels=out_channels[0],
            kernel_shape=[1, 1, 1],
            padding=0,
            name=name + "/Branch_0/Conv3d_0a_1x1",
        )
        self.b1a = Unit3D(
            in_channels=in_channels,
            output_channels=out_channels[1],
            kernel_shape=[1, 1, 1],
            padding=0,
            name=name + "/Branch_1/Conv3d_0a_1x1",
        )
        self.b1b = Unit3D(
            in_channels=out_channels[1],
            output_channels=out_channels[2],
            kernel_shape=[3, 3, 3],
            name=name + "/Branch_1/Conv3d_0b_3x3",
        )
        self.b2a = Unit3D(
            in_channels=in_channels,
            output_channels=out_channels[3],
            kernel_shape=[1, 1, 1],
            padding=0,
            name=name + "/Branch_2/Conv3d_0a_1x1",
        )
        self.b2b = Unit3D(
            in_channels=out_channels[3],
            output_channels=out_channels[4],
            kernel_shape=[3, 3, 3],
            name=name + "/Branch_2/Conv3d_0b_3x3",
        )
        self.b3a = MaxPool3dSamePadding(
            kernel_size=[3, 3, 3], stride=(1, 1, 1), padding=0
        )
        self.b3b = Unit3D(
            in_channels=in_channels,
            output_channels=out_channels[5],
            kernel_shape=[1, 1, 1],
            padding=0,
            name=name + "/Branch_3/Conv3d_0b_1x1",
        )
        self.name = name

    def forward(self, x):
        b0 = self.b0(x)
        b1 = self.b1b(self.b1a(x))
        b2 = self.b2b(self.b2a(x))
        b3 = self.b3b(self.b3a(x))
        return cat([b0, b1, b2, b3], dim=1)


class InceptionI3d(Module):
    """Inception-v1 I3D architecture.
    The model is introduced in:
        Quo Vadis, Action Recognition? A New Model and the Kinetics Dataset
        Joao Carreira, Andrew Zisserman
        https://arxiv.org/pdf/1705.07750v1.pdf.
    See also the Inception architecture, introduced in:
        Going deeper with convolutions
        Christian Szegedy, Wei Liu, Yangqing Jia, Pierre Sermanet, Scott Reed,
        Dragomir Anguelov, Dumitru Erhan, Vincent Vanhoucke, Andrew Rabinovich.
        http://arxiv.org/pdf/1409.4842v1.pdf.
    """

    # Endpoints of the model in order. During construction, all the endpoints up
    # to a designated `final_endpoint` are returned in a dictionary as the
    # second return value.
    VALID_ENDPOINTS = (
        "Conv3d_1a_7x7",
        "MaxPool3d_2a_3x3",
        "Conv3d_2b_1x1",
        "Conv3d_2c_3x3",
        "MaxPool3d_3a_3x3",
        "Mixed_3b",
        "Mixed_3c",
        "MaxPool3d_4a_3x3",
        "Mixed_4b",
        "Mixed_4c",
        "Mixed_4d",
        "Mixed_4e",
        "Mixed_4f",
        "MaxPool3d_5a_2x2",
        "Mixed_5b",
        "Mixed_5c",
        "Logits",
        "Predictions",
    )

    def __init__(
        self,
        num_classes=400,
        spatiotemporal_squeeze=True,
        final_endpoint="Logits",
        name="inception_i3d",
        in_channels=3,
        dropout_keep_prob=0.5,
        num_in_frames=64,
        include_embds=False,
    ):
        """Initializes I3D model instance.
        Args:
            num_classes: The number of outputs in the logit layer (default 400, which
                matches the Kinetics dataset).
            spatiotemporal_squeeze: Whether to squeeze the 2 spatial and 1 temporal dimensions for the logits
                before returning (default True).
            final_endpoint: The model contains many possible endpoints.
                `final_endpoint` specifies the last endpoint for the model to be built
                up to. In addition to the output at `final_endpoint`, all the outputs
                at endpoints up to `final_endpoint` will also be returned, in a
                dictionary. `final_endpoint` must be one of
                InceptionI3d.VALID_ENDPOINTS (default 'Logits').
            in_channels: Number of input channels (default 3 for RGB).
            dropout_keep_prob: Dropout probability (default 0.5).
            name: A string (optional). The name of this module.
            num_in_frames: Number of input frames (default 64).
            include_embds: Whether to return embeddings (default False).
        Raises:
            ValueError: if `final_endpoint` is not recognized.
        """

        if final_endpoint not in self.VALID_ENDPOINTS:
            raise ValueError("Unknown final endpoint %s" % final_endpoint)

        super().__init__()
        self._num_classes = num_classes
        self._spatiotemporal_squeeze = spatiotemporal_squeeze
        self._final_endpoint = final_endpoint
        self.include_embds = include_embds
        self.logits = None

        if self._final_endpoint not in self.VALID_ENDPOINTS:
            raise ValueError("Unknown final endpoint %s" % self._final_endpoint)

        self.end_points = {}
        end_point = "Conv3d_1a_7x7"
        self.end_points[end_point] = Unit3D(
            in_channels=in_channels,
            output_channels=64,
            kernel_shape=[7, 7, 7],
            stride=(2, 2, 2),
            padding=(3, 3, 3),
            name=name + end_point,
        )
        if self._final_endpoint == end_point:
            return

        end_point = "MaxPool3d_2a_3x3"
        self.end_points[end_point] = MaxPool3dSamePadding(
            kernel_size=[1, 3, 3], stride=(1, 2, 2), padding=0
        )
        if self._final_endpoint == end_point:
            return

        end_point = "Conv3d_2b_1x1"
        self.end_points[end_point] = Unit3D(
            in_channels=64,
            output_channels=64,
            kernel_shape=[1, 1, 1],
            padding=0,
            name=name + end_point,
        )
        if self._final_endpoint == end_point:
            return

        end_point = "Conv3d_2c_3x3"
        self.end_points[end_point] = Unit3D(
            in_channels=64,
            output_channels=192,
            kernel_shape=[3, 3, 3],
            padding=1,
            name=name + end_point,
        )
        if self._final_endpoint == end_point:
            return

        end_point = "MaxPool3d_3a_3x3"
        self.end_points[end_point] = MaxPool3dSamePadding(
            kernel_size=[1, 3, 3], stride=(1, 2, 2), padding=0
        )
        if self._final_endpoint == end_point:
            return

        end_point = "Mixed_3b"
        self.end_points[end_point] = InceptionModule(
            192, [64, 96, 128, 16, 32, 32], name + end_point,
        )
        if self._final_endpoint == end_point:
            return

        end_point = "Mixed_3c"
        self.end_points[end_point] = InceptionModule(
            256, [128, 128, 192, 32, 96, 64], name + end_point,
        )
        if self._final_endpoint == end_point:
            return

        end_point = "MaxPool3d_4a_3x3"
        self.end_points[end_point] = MaxPool3dSamePadding(
            kernel_size=[3, 3, 3], stride=(2, 2, 2), padding=0
        )
        if self._final_endpoint == end_point:
            return

        end_point = "Mixed_4b"
        self.end_points[end_point] = InceptionModule(
            128 + 192 + 96 + 64, [192, 96, 208, 16, 48, 64], name + end_point,
        )
        if self._final_endpoint == end_point:
            return

        end_point = "Mixed_4c"
        self.end_points[end_point] = InceptionModule(
            192 + 208 + 48 + 64, [160, 112, 224, 24, 64, 64], name + end_point,
        )
        if self._final_endpoint == end_point:
            return

        end_point = "Mixed_4d"
        self.end_points[end_point] = InceptionModule(
            160 + 224 + 64 + 64, [128, 128, 256, 24, 64, 64], name + end_point,
        )
        if self._final_endpoint == end_point:
            return

        end_point = "Mixed_4e"
        self.end_points[end_point] = InceptionModule(
            128 + 256 + 64 + 64, [112, 144, 288, 32, 64, 64], name + end_point,
        )
        if self._final_endpoint == end_point:
            return

        end_point = "Mixed_4f"
        self.end_points[end_point] = InceptionModule(
            112 + 288 + 64 + 64, [256, 160, 320, 32, 128, 128], name + end_point,
        )
        if self._final_endpoint == end_point:
            return

        end_point = "MaxPool3d_5a_2x2"
        self.end_points[end_point] = MaxPool3dSamePadding(
            kernel_size=[2, 2, 2], stride=(2, 2, 2), padding=0
        )
        if self._final_endpoint == end_point:
            return

        end_point = "Mixed_5b"
        self.end_points[end_point] = InceptionModule(
            256 + 320 + 128 + 128, [256, 160, 320, 32, 128, 128], name + end_point,
        )
        if self._final_endpoint == end_point:
            return

        end_point = "Mixed_5c"
        self.end_points[end_point] = InceptionModule(
            256 + 320 + 128 + 128, [384, 192, 384, 48, 128, 128], name + end_point,
        )
        if self._final_endpoint == end_point:
            return

        end_point = "Logits"

        last_duration = int(ceil(num_in_frames / 8))  # 8
        last_size = 7  # int(ceil(sample_width / 32)) # this is for 224
        self.avgpool = AvgPool3d((last_duration, last_size, last_size), stride=1)

        self.dropout = Dropout(dropout_keep_prob)

        self.logits = Unit3D(
            in_channels=384 + 384 + 128 + 128,
            output_channels=self._num_classes,
            kernel_shape=[1, 1, 1],
            padding=0,
            activation_fn=None,
            use_batch_norm=False,
            use_bias=True,
            name="logits",
        )

        self.build()

    def replace_logits(self, num_classes):
        self._num_classes = num_classes
        self.logits = Unit3D(
            in_channels=384 + 384 + 128 + 128,
            output_channels=self._num_classes,
            kernel_shape=[1, 1, 1],
            padding=0,
            activation_fn=None,
            use_batch_norm=False,
            use_bias=True,
            name="logits",
        )

    def build(self):
        for k in self.end_points.keys():
            self.add_module(k, self.end_points[k])

    def forward(self, x):
        for end_point in self.VALID_ENDPOINTS:
            if end_point in self.end_points:
                x = self._modules[end_point](x)
        # [batch x featuredim x 1 x 1 x 1]
        embds = self.dropout(self.avgpool(x))

        # [batch x classes x 1 x 1 x 1]
        x = self.logits(embds)
        if self._spatiotemporal_squeeze:
            # [batch x classes]
            logits = x.squeeze(3).squeeze(3).squeeze(2)

        # logits [batch X classes]
        if self.include_embds:
            return {"logits": logits, "embds": embds}
        else:
            return {"logits": logits}
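A quick shape sanity check for the model as defined above (sketch; random weights on CPU, so the outputs themselves are meaningless):

    import torch

    from app.models import InceptionI3d

    model = InceptionI3d(num_classes=2000, num_in_frames=64, include_embds=True)
    model.eval()

    with torch.no_grad():
        out = model(torch.zeros(1, 3, 64, 224, 224))  # batch x RGB x frames x H x W

    print(out["logits"].shape)  # torch.Size([1, 2000])
    print(out["embds"].shape)   # torch.Size([1, 1024, 1, 1, 1]), pooled features

The 64-frame, 224x224 input matches Config.number_of_frames and the hard-coded average-pool window (ceil(64/8) x 7 x 7), which is sized for 224-pixel inputs.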
app/server/__init__.py
ADDED
@@ -0,0 +1,50 @@
from http.server import BaseHTTPRequestHandler, HTTPServer
from threading import Thread

from app.translator import Translator


class Server(BaseHTTPRequestHandler):

    def __init__(self, translator: Translator):

        self.translator = translator


    def __call__(self, *args, **kwargs):

        # The instance doubles as the handler "class": HTTPServer calls it once
        # per request, which runs BaseHTTPRequestHandler's real initialiser.
        super().__init__(*args, **kwargs)


    def do_GET(self):

        self.send_response(200)
        self.send_header('Access-Control-Allow-Origin', '*')
        self.end_headers()
        self.wfile.write(self.translator.result.encode('utf-8'))


class HTTPDaemon:

    def __init__(self, host: str, port: int, translator: Translator):

        self.host = host
        self.port = port
        self.httpd = HTTPServer((self.host, self.port), Server(translator))

        self.server_thread: Thread


    def __enter__(self):

        print(f"Serving HTTP on {self.host} port {self.port} (http://{self.host}:{self.port}/)..")
        self.server_thread = Thread(target=self.httpd.serve_forever)
        self.server_thread.start()


    def __exit__(self, *_):

        print("\nServer closing..")
        self.httpd.shutdown()
        self.httpd.server_close()
        self.server_thread.join()
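The web page polls GET / for the latest word; the same can be done from Python (sketch; assumes the daemon is running on the default port from init_server):

    from urllib.request import urlopen

    with urlopen("http://localhost:5000/") as response:
        print(response.read().decode("utf-8"))  # empty until a confident prediction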
app/translator/__init__.py
ADDED
@@ -0,0 +1,245 @@
import math
import pickle
from collections import deque
from os.path import exists
from subprocess import call, DEVNULL

import cv2 as cv
import numpy as np
import scipy.ndimage
import scipy.special  # import submodules explicitly; a bare `import scipy` does not expose them
import torch
from torch.nn import DataParallel, Module

from app.config import Config
from app.models import InceptionI3d


class Translator:

    def __init__(self, confidence: float):

        self.confidence = confidence
        self.model = self.load_model(Config.checkpoint_path, Config.number_of_classes, Config.number_of_frames)
        self.word_data = self.load_vocabulary(Config.vocabulary_path)
        self.result = ""


    def resize_generic(self, img, oheight, owidth, interp="bilinear", is_flow=False):
        """
        Args
            inp: numpy array: RGB image (H, W, 3) | video with 3*nframes (H, W, 3*nframes)
                 | single channel image (H, W, 1) | -- not supported: video with (nframes, 3, H, W)
        """

        # resized_image = cv.resize(image, (100, 50))
        ht = img.shape[0]
        chn = img.shape[2]

        if chn == 1:
            # NOTE: scipy.misc.imresize was removed in SciPy 1.3; this branch needs an older SciPy.
            resized_img = scipy.misc.imresize(
                img.squeeze(), [oheight, owidth], interp=interp, mode="F"
            ).reshape((oheight, owidth, chn))
        elif chn == 3:
            # resized_img = scipy.misc.imresize(img, [oheight, owidth], interp=interp) # mode='F' gives an error for 3 channels
            resized_img = cv.resize(img, (owidth, oheight))  # (width, height) order, inverted compared to scipy
        elif chn == 2:
            # assert(is_flow)
            resized_img = np.zeros((oheight, owidth, chn), dtype=img.dtype)
            for t in range(chn):
                # resized_img[:, :, t] = scipy.misc.imresize(img[:, :, t], [oheight, owidth], interp=interp)
                # resized_img[:, :, t] = scipy.misc.imresize(img[:, :, t], [oheight, owidth], interp=interp, mode='F')
                # resized_img[:, :, t] = np.array(Image.fromarray(img[:, :, t]).resize([oheight, owidth]))
                resized_img[:, :, t] = scipy.ndimage.interpolation.zoom(
                    img[:, :, t], [oheight, owidth]
                )
        else:
            in_chn = 3
            # Workaround, would be better to pass #frames
            if chn == 16:
                in_chn = 1
            if chn == 32:
                in_chn = 2
            nframes = int(chn / in_chn)
            img = img.reshape(img.shape[0], img.shape[1], in_chn, nframes)
            resized_img = np.zeros((oheight, owidth, in_chn, nframes), dtype=img.dtype)
            for t in range(nframes):
                frame = img[:, :, :, t]  # img[:, :, t*3:t*3+3]
                frame = cv.resize(frame, (owidth, oheight)).reshape(
                    oheight, owidth, in_chn
                )
                # frame = scipy.misc.imresize(frame, [oheight, owidth], interp=interp)
                resized_img[:, :, :, t] = frame
            resized_img = resized_img.reshape(
                resized_img.shape[0], resized_img.shape[1], chn
            )

        if is_flow:
            # print(oheight / ht)
            # print(owidth / wd)
            resized_img = resized_img * oheight / ht
        return resized_img


    def color_normalize(self, x, mean, std):
        """Normalize a tensor of images by subtracting (resp. dividing) by the mean (resp.
        std. deviation) statistics of a dataset in RGB space.
        """
        if x.dim() in {3, 4}:
            if x.size(0) == 1:
                x = x.repeat(3, 1, 1)
            assert x.size(0) == 3, "For single video format, expected RGB along first dim"
            for t, m, s in zip(x, mean, std):
                t.sub_(m)
                t.div_(s)
        elif x.dim() == 5:
            assert (
                x.shape[1] == 3
            ), "For batched video format, expected RGB along second dim"
            x[:, 0].sub_(mean[0]).div_(std[0])
            x[:, 1].sub_(mean[1]).div_(std[1])
            x[:, 2].sub_(mean[2]).div_(std[2])
        return x


    def to_torch(self, ndarray):

        if type(ndarray).__module__ == "numpy":
            return torch.from_numpy(ndarray)
        elif not torch.is_tensor(ndarray):
            raise ValueError(f"Cannot convert {type(ndarray)} to torch tensor")
        return ndarray


    def to_numpy(self, tensor):

        if torch.is_tensor(tensor):
            return tensor.cpu().numpy()
        elif type(tensor).__module__ != "numpy":
            raise ValueError(f"Cannot convert {type(tensor)} to numpy array")
        return tensor


    def im_to_numpy(self, img):

        img = self.to_numpy(img)
        img = np.transpose(img, (1, 2, 0))  # H*W*C

        return img


    def im_to_torch(self, img):

        img = np.transpose(img, (2, 0, 1))  # C*H*W
        img = self.to_torch(img).float()

        return img / 255 if img.max() > 1 else img


    def load_model(self, checkpoint_path: str, number_of_classes: int, number_of_frames: int) -> Module:

        model = DataParallel(InceptionI3d(
            number_of_classes,
            spatiotemporal_squeeze=True,
            final_endpoint='Logits',
            name="inception_i3d",
            in_channels=3,
            dropout_keep_prob=0.5,
            num_in_frames=number_of_frames
        )).cuda()

        if not exists(Config.checkpoint_path):
            call(f'cat app/checkpoints/* >> {Config.checkpoint_path}', shell=True, stdout=DEVNULL)

        checkpoint = torch.load(checkpoint_path)
        model.load_state_dict(checkpoint['state_dict'])
        model.eval()

        return model


    def load_vocabulary(self, vocabulary_path: str) -> dict:

        with open(vocabulary_path, 'rb') as file:
            return pickle.load(file)


    def prepare_input(self, video: deque, input_resolution: int=224, resize_resolution: int=256, mean: torch.Tensor=0.5*torch.ones(3), std: torch.Tensor=1.0*torch.ones(3)) -> torch.Tensor:

        video_tensor = torch.stack(
            [self.im_to_torch(frame[:, :, [2, 1, 0]]) for frame in video]
        ).permute(1, 0, 2, 3)

        iC, iF, _, _ = video_tensor.shape
        video_tensor_resized = np.zeros((iF, resize_resolution, resize_resolution, iC))
        for t in range(iF):
            tmp = video_tensor[:, t, :, :]
            tmp = self.resize_generic(
                self.im_to_numpy(tmp), resize_resolution, resize_resolution, interp="bilinear", is_flow=False
            )
            video_tensor_resized[t] = tmp
        video_tensor_resized = np.transpose(video_tensor_resized, (3, 0, 1, 2))
        # Center crop coords
        ulx = int((resize_resolution - input_resolution) / 2)
        uly = int((resize_resolution - input_resolution) / 2)
        # Center crop to input_resolution x input_resolution (224x224 by default)
        video_tensor_resized = video_tensor_resized[:, :, uly : uly + input_resolution, ulx : ulx + input_resolution]
        video_tensor_resized = self.to_torch(video_tensor_resized).float()
        assert video_tensor_resized.max() <= 1
        video_tensor_resized = self.color_normalize(video_tensor_resized, mean, std)
        return video_tensor_resized


    def sliding_windows(self, input_video: torch.Tensor, number_of_frames: int, stride: int) -> torch.Tensor:
        """
        Return sliding windows and corresponding (middle) timestamp
        """
        C, nFrames, H, W = input_video.shape
        # If needed, pad to the minimum clip length
        if nFrames < number_of_frames:
            rgb_ = torch.zeros(C, number_of_frames, H, W)
            rgb_[:, :nFrames] = input_video
            rgb_[:, nFrames:] = input_video[:, -1].unsqueeze(1)
            input_video = rgb_
            nFrames = input_video.shape[1]

        num_clips = math.ceil((nFrames - number_of_frames) / stride) + 1

        rgb_slided = torch.zeros(num_clips, 3, number_of_frames, H, W)
        # For each clip
        for j in range(num_clips):
            # Check if num_clips becomes 0
            stride_j = j * stride
            actual_clip_length = min(number_of_frames, nFrames - stride_j)
            t_beg = stride_j if actual_clip_length == number_of_frames else nFrames - number_of_frames
            rgb_slided[j] = input_video[:, t_beg : t_beg + number_of_frames, :, :]

        return rgb_slided


    def video_to_asl(self, video: deque):

        input_video = self.prepare_input(video)
        input_sliding_window = self.sliding_windows(input_video, Config.number_of_frames, Config.stride)

        num_clips = input_sliding_window.shape[0]
        # Group the clips into batches
        num_batches = math.ceil(num_clips / Config.batch_size)
        raw_scores = np.empty((0, Config.number_of_classes), dtype=float)
        for b in range(num_batches):
            inp = input_sliding_window[b * Config.batch_size : (b + 1) * Config.batch_size]
            # Forward pass
            out = self.model(inp)
            raw_scores = np.append(raw_scores, out["logits"].cpu().detach().numpy(), axis=0)
        prob_scores = scipy.special.softmax(raw_scores, axis=1)
        prob_sorted = np.sort(prob_scores, axis=1)[:, ::-1]
        pred_sorted = np.argsort(prob_scores, axis=1)[:, ::-1]

        word_topk = np.empty((Config.topk, num_clips), dtype=object)
        for k in range(Config.topk):
            for i, p in enumerate(pred_sorted[:, k]):
                word_topk[k, i] = self.word_data["words"][p]
        prob_topk = prob_sorted[:, :Config.topk].transpose()

        # print(np.array([word_topk, prob_topk]).transpose())
        self.result = "" if prob_topk[0, 0] <= self.confidence else word_topk[0, 0]
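Note how the defaults interact: the camera buffer in app/__init__.py holds exactly Config.number_of_frames (64) frames, so each video_to_asl call sees a single sliding window, and stride only matters for longer inputs. The window-count arithmetic from sliding_windows, worked through (sketch):

    import math

    def num_clips(n_frames: int, clip_len: int = 64, stride: int = 8) -> int:
        n_frames = max(n_frames, clip_len)  # short inputs are padded up to clip_len
        return math.ceil((n_frames - clip_len) / stride) + 1

    print(num_clips(64))   # 1 -- the live 64-frame camera buffer
    print(num_clips(128))  # 9 overlapping clips for a longer recording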
main.py
ADDED
@@ -0,0 +1,4 @@
from app import init_server

if __name__ == '__main__':
    init_server()
public/index.css
ADDED
@@ -0,0 +1,20 @@
span {
  line-height: 1.8;
  word-wrap: normal;
  display: inline-block;
  padding-right: 0.3em;
}

#result {
  position: fixed;
  left: 50%;
  top: 50%;
  transform: translate(-50%, -50%);
}

.word {
  text-align: center;
  opacity: 1;
  font-family: 'Roboto Mono', monospace;
  font-size: xx-large;
}
public/index.html
ADDED
@@ -0,0 +1,67 @@
<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="UTF-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1" />
    <title>Real-time ASL Translator</title>
    <link rel="preconnect" href="https://fonts.googleapis.com" />
    <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin />
    <link href="https://fonts.googleapis.com/css2?family=Roboto+Mono:wght@500&display=swap" rel="stylesheet" />
    <link rel="stylesheet" href="index.css" />
    <script src="https://cdnjs.cloudflare.com/ajax/libs/gsap/3.6.1/gsap.min.js"></script>
  </head>

  <body>
    <div id="result"></div>

    <script type="text/javascript" charset="utf-8">
      const result_element = document.getElementById('result')
      let previous_result = ''
      result_element.style.opacity = '1'

      const timestep = 1 / 60
      let opacity_progress = 1

      const ease_in_expo = (progress_value) =>
        progress_value == 0.0 ? 0.0 : Math.pow(2.0, 10.0 * progress_value - 10.0)

      // Fade the accumulated words out exponentially; clear once nearly invisible.
      setInterval(() => {
        opacity_progress -= 0.0025
        result_element.style.opacity = String(ease_in_expo(opacity_progress))
        if (Number(result_element.style.opacity) <= 0.05) {
          result_element.replaceChildren()
        }
      }, timestep * 1000)

      // Poll the local translator server four times per second; append new words.
      setInterval(async () => {
        const result = await fetch('http://localhost:5000/')

        if (result.status !== 200) return
        const result_text = await result.text()

        if (result_text === '') return
        if (result_text !== previous_result) {
          previous_result = result_text
          const word = document.createElement('span')
          word.className = 'word'
          word.innerHTML = result_text
          result_element.appendChild(word)
          opacity_progress = 1

          gsap.fromTo(
            word,
            {
              autoAlpha: 0,
              filter: 'blur(10px)',
              y: 40
            },
            {
              autoAlpha: 1,
              filter: 'blur(0px)',
              y: 0,
              ease: 'Expo.easeOut',
              duration: 0.5
            }
          )
        }
      }, 250)
    </script>
  </body>
</html>
requirements.txt
CHANGED
@@ -1,9 +1,14 @@
opencv >= 4.6.0
pympi-ling
intervaltree
zsvision
mergedeep
humanize
mock
tqdm
tensorboard
scipy
pillow
scikit-learn
frozendict
numpy
resources/demo.gif
ADDED
Git LFS Details