ferrazzipietro committed
Commit c1efb72 · 1 Parent(s): 6265e38
This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. Dockerfile +133 -0
  2. README.md +69 -5
  3. docker-compose.dev.yaml +0 -0
  4. seamless_server/.DS_Store +0 -0
  5. seamless_server/.gitignore +4 -0
  6. seamless_server/app_pubsub.py +829 -0
  7. seamless_server/models/.DS_Store +0 -0
  8. seamless_server/models/Seamless/vad_s2st_sc_24khz_main.yaml +25 -0
  9. seamless_server/models/SeamlessStreaming/vad_s2st_sc_main.yaml +21 -0
  10. seamless_server/requirements.txt +29 -0
  11. seamless_server/run_docker.sh +5 -0
  12. seamless_server/src/room.py +64 -0
  13. seamless_server/src/simuleval_agent_directory.py +171 -0
  14. seamless_server/src/simuleval_transcoder.py +428 -0
  15. seamless_server/src/speech_and_text_output.py +15 -0
  16. seamless_server/src/transcoder_helpers.py +43 -0
  17. streaming-react-app/.eslintrc.cjs +18 -0
  18. streaming-react-app/.gitignore +24 -0
  19. streaming-react-app/README.md +14 -0
  20. streaming-react-app/index.html +13 -0
  21. streaming-react-app/package-lock.json +0 -0
  22. streaming-react-app/package.json +53 -0
  23. streaming-react-app/src/App.tsx +57 -0
  24. streaming-react-app/src/Blink.tsx +41 -0
  25. streaming-react-app/src/DebugSection.tsx +62 -0
  26. streaming-react-app/src/RoomConfig.tsx +263 -0
  27. streaming-react-app/src/SocketWrapper.tsx +218 -0
  28. streaming-react-app/src/StreamingInterface.css +56 -0
  29. streaming-react-app/src/StreamingInterface.tsx +1221 -0
  30. streaming-react-app/src/URLParams.ts +50 -0
  31. streaming-react-app/src/assets/Roboto-msdf.json +0 -0
  32. streaming-react-app/src/assets/Roboto-msdf.png +0 -0
  33. streaming-react-app/src/assets/RobotoMono-Regular-msdf.json +0 -0
  34. streaming-react-app/src/assets/RobotoMono-Regular.png +0 -0
  35. streaming-react-app/src/assets/illy.svg +990 -0
  36. streaming-react-app/src/assets/illy2.svg +990 -0
  37. streaming-react-app/src/createBufferedSpeechPlayer.ts +173 -0
  38. streaming-react-app/src/cursorBlinkInterval.ts +1 -0
  39. streaming-react-app/src/debug.ts +257 -0
  40. streaming-react-app/src/float32To16BitPCM.ts +16 -0
  41. streaming-react-app/src/generateNewRoomID.ts +56 -0
  42. streaming-react-app/src/getParamFlag.ts +39 -0
  43. streaming-react-app/src/getTranslationSentencesFromReceivedData.ts +22 -0
  44. streaming-react-app/src/isScrolledToDocumentBottom.ts +11 -0
  45. streaming-react-app/src/languageLookup.ts +119 -0
  46. streaming-react-app/src/main.tsx +9 -0
  47. streaming-react-app/src/react-xr/ARButton.tsx +89 -0
  48. streaming-react-app/src/react-xr/Button.tsx +117 -0
  49. streaming-react-app/src/react-xr/Colors.ts +6 -0
  50. streaming-react-app/src/react-xr/MovementController.tsx +64 -0
Dockerfile ADDED
@@ -0,0 +1,133 @@
+ # build frontend with node
+ FROM node:20-alpine AS frontend
+ RUN apk add --no-cache libc6-compat
+ WORKDIR /app
+
+ ARG DUMMY_ARG=1
+
+ COPY streaming-react-app .
+ RUN \
+ if [ -f yarn.lock ]; then yarn --frozen-lockfile; \
+ elif [ -f package-lock.json ]; then npm ci; \
+ elif [ -f pnpm-lock.yaml ]; then yarn global add pnpm && pnpm i --frozen-lockfile; \
+ else echo "Lockfile not found." && exit 1; \
+ fi
+
+ RUN npm run build
+
+ # build backend on CUDA
+ FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04 AS backend
+ WORKDIR /app
+
+ ENV DEBIAN_FRONTEND=noninteractive
+ ENV NODE_MAJOR=20
+
+ RUN apt-get update && \
+ apt-get upgrade -y && \
+ apt-get install -y --no-install-recommends \
+ git \
+ git-lfs \
+ wget \
+ curl \
+ # python build dependencies \
+ build-essential \
+ libssl-dev \
+ zlib1g-dev \
+ libbz2-dev \
+ libreadline-dev \
+ libsqlite3-dev \
+ libncursesw5-dev \
+ xz-utils \
+ tk-dev \
+ libxml2-dev \
+ libxmlsec1-dev \
+ libffi-dev \
+ liblzma-dev \
+ sox libsox-fmt-all \
+ # gradio dependencies \
+ ffmpeg \
+ # fairseq2 dependencies \
+ libsndfile-dev && \
+ apt-get clean && \
+ rm -rf /var/lib/apt/lists/*
+
+
+ # RUN apt-get install zlib1g-dev &&\
+ # wget https://sourceforge.net/projects/libpng/files/libpng15/1.5.30/libpng-1.5.30.tar.gz &&\
+ # tar -xzvf libpng-1.5.30.tar.gz &&\
+ # cd libpng-1.5.30 &&\
+ # ./configure --prefix=/usr/local/libpng &&\
+ # make &&\
+ # make install &&\
+ # echo "Contents of /usr/local/lib:" && ls /usr/local/lib && \
+ # echo "Contents of /usr/local/libpng/lib:" && ls /usr/local/libpng/lib && \
+ # # ls /usr/local/lib/libpng* &&\
+ # ldconfig &&\
+ # cd ..
+ # ENV LD_LIBRARY_PATH=/usr/local/libpng/lib:
+
+ # RUN wget http://www.ijg.org/files/jpegsrc.v9a.tar.gz &&\
+ # tar -xzvf jpegsrc.v9a.tar.gz &&\
+ # cd jpeg-9a &&\
+ # ./configure --prefix=/usr/local/libjpeg &&\
+ # make &&\
+ # make install
+ # RUN cd .. &&\
+ # echo "Contents of /usr/local/lib :" && ls /usr/local && \
+ # echo "Contents of /usr/local/libjpeg/lib :" && ls /usr/local/libjpeg/lib && \
+ # ls /usr/local/lib/libpng* &&\
+ # ldconfig
+ # # ENV LD_LIBRARY_PATH=/usr/local/libpng/lib:
+
+ # RUN wget libjpeg62-turbo_2.0.6-4_amd64.deb &&\
+ # tar -xzvf jpegsrc.v9a.tar.gz &&\
+ # cd jpeg-9a &&\
+ # ./configure --prefix=/usr/local/libjpeg &&\
+ # make &&\
+ # make install
+ # RUN cd .. &&\
+ # echo "Contents of /usr/local/lib :" && ls /usr/local && \
+ # echo "Contents of /usr/local/libjpeg/lib :" && ls /usr/local/libjpeg/lib && \
+ # ls /usr/local/lib/libpng* &&\
+ # ldconfig
+ # ENV LD_LIBRARY_PATH=/usr/local/libpng/lib:
+
+ # RUN apt-get update &&\
+ # apt-get -y install libjpeg62-turbo-dev &&\
+ # apt-get install libjpeg8 libbodfile1
+
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV HOME=/home/user \
+ PATH=/home/user/.local/bin:$PATH
+ WORKDIR $HOME/app
+
+ RUN curl https://pyenv.run | bash
+ ENV PATH=$HOME/.pyenv/shims:$HOME/.pyenv/bin:$PATH
+ ARG PYTHON_VERSION=3.10.12
+ RUN pyenv install $PYTHON_VERSION && \
+ pyenv global $PYTHON_VERSION && \
+ pyenv rehash && \
+ pip install --no-cache-dir -U pip setuptools wheel
+
+ COPY --chown=user:user ./seamless_server ./seamless_server
+ # change dir since pip needs to see the whl folder
+ RUN cd seamless_server && \
+ pip install fairseq2 &&\
+ # pip install fairseq2 --pre --extra-index-url https://fair.pkg.atmeta.com/fairseq2/whl/nightly/pt2.1.1/cu118 && \
+ pip install --no-cache-dir --upgrade -r requirements.txt
+ COPY --from=frontend /app/dist ./streaming-react-app/dist
+
+ WORKDIR $HOME/app/seamless_server
+ RUN --mount=type=secret,id=HF_TOKEN,mode=0444,required=false \
+ huggingface-cli login --token $(cat /run/secrets/HF_TOKEN) || echo "HF_TOKEN error" && \
+ huggingface-cli download meta-private/SeamlessExpressive pretssel_melhifigan_wm-final.pt --local-dir ./models/Seamless/ || echo "HF_TOKEN error" && \
+ ln -s $(readlink -f models/Seamless/pretssel_melhifigan_wm-final.pt) models/Seamless/pretssel_melhifigan_wm.pt || true;
+
+ USER root
+ RUN ln -s /usr/lib/x86_64-linux-gnu/libsox.so.3 /usr/lib/x86_64-linux-gnu/libsox.so
+ USER user
+ RUN ["chmod", "+x", "./run_docker.sh"]
+ CMD ./run_docker.sh
+
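A note on building: the model-download step above uses `--mount=type=secret,id=HF_TOKEN`, which requires BuildKit. Assuming a token file on the host, something like `DOCKER_BUILDKIT=1 docker build --secret id=HF_TOKEN,src=/path/to/hf_token -t seamless-streaming .` builds the image, and `docker run --gpus all -p 7860:7860 seamless-streaming` runs it (the tag is illustrative; `--gpus all` matches the CUDA base image and 7860 is the port exposed in run_docker.sh).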
README.md CHANGED
@@ -1,10 +1,74 @@
  ---
- title: Stream Trans Docker
- emoji: 📊
- colorFrom: purple
- colorTo: blue
+ title: Seamless Streaming
+ emoji: 📞
+ colorFrom: blue
+ colorTo: yellow
  sdk: docker
  pinned: false
+ suggested_hardware: t4-small
+ models:
+ - facebook/seamless-streaming
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # AWS resources
+ I use a g4 machine with a 35 GB volume. You can delete the volume, then create a new one and attach it to the instance when needed; obviously you'd then need to re-pull the docker image.
+ The same is true for the elastic IP. The volume and the elastic IP each cost me ~$0.10 per day, ~$0.22 per day overall. It does not make sense to keep them if you won't use them for a couple of months.
+
+ # Seamless Streaming demo
+ ## Running on HF spaces
+ You can simply duplicate the space to run it.
+
+ ## Running locally
+ ### Install backend seamless_server dependencies
+
+ > [!NOTE]
+ > We *do not* recommend running the model on CPU. CPU inference will be slow and introduce noticeable delays in the simultaneous translation.
+
+ > [!NOTE]
+ > The example below is for PyTorch stable (2.1.1) and variant cu118.
+ > Check [here](https://pytorch.org/get-started/locally/) to find the torch/torchaudio command for your variant.
+ > Check [here](https://github.com/facebookresearch/fairseq2#variants) to find the fairseq2 command for your variant.
+
+ If running for the first time, create a conda environment and install the desired torch version, then install the rest of the requirements:
+ ```
+ cd seamless_server
+ conda create --yes --name smlss_server python=3.8 libsndfile==1.0.31
+ conda activate smlss_server
+ conda install --yes pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia
+ pip install fairseq2 --pre --extra-index-url https://fair.pkg.atmeta.com/fairseq2/whl/nightly/pt2.1.1/cu118
+ pip install -r requirements.txt
+ ```
+
+ ### Install frontend streaming-react-app dependencies
+ ```
+ conda install -c conda-forge nodejs
+ cd streaming-react-app
+ npm install --global yarn
+ yarn
+ yarn build # this will create the dist/ folder
+ ```
+
+ ### Running the server
+
+ The server can be run locally with uvicorn, as shown below.
+ Run the server in dev mode:
+
+ ```
+ cd seamless_server
+ uvicorn app_pubsub:app --reload --host localhost
+ ```
+
+ Run the server in prod mode:
+
+ ```
+ cd seamless_server
+ uvicorn app_pubsub:app --host 0.0.0.0
+ ```
+
+ To enable additional logging from uvicorn, pass `--log-level debug` or `--log-level trace`, e.g. `uvicorn app_pubsub:app --host 0.0.0.0 --log-level debug`.
+
+ ### Debugging
+
+ If you enable "Server Debug Flag" when starting streaming from the client, the server turns on extensive debug logging and saves audio files in the /debug folder.
docker-compose.dev.yaml ADDED
File without changes
seamless_server/.DS_Store ADDED
Binary file (6.15 kB).
 
seamless_server/.gitignore ADDED
@@ -0,0 +1,4 @@
+ __pycache__/
+ src/__pycache__/
+ debug/
+ .vscode/
seamless_server/app_pubsub.py ADDED
@@ -0,0 +1,829 @@
+ from operator import itemgetter
+ import os
+ from typing import Any, Optional, Tuple, Dict, TypedDict
+ from urllib import parse
+ from uuid import uuid4
+ import colorlog
+ import io
+ import logging
+ from pprint import pformat
+ import socketio
+ import sys
+ import time
+ import random
+ import string
+ from starlette.applications import Starlette
+ from starlette.routing import Mount, Route
+ from starlette.staticfiles import StaticFiles
+
+
+ from src.room import Room, Member
+ from src.simuleval_agent_directory import NoAvailableAgentException
+ from src.simuleval_agent_directory import SimulevalAgentDirectory
+ from src.simuleval_transcoder import SimulevalTranscoder
+ from src.transcoder_helpers import get_transcoder_output_events
+
+ ###############################################
+ # Constants
+ ###############################################
+
+ DEBUG = True
+
+ ALL_ROOM_ID = "ALL"
+
+ ROOM_ID_USABLE_CHARACTERS = string.ascii_uppercase
+ ROOM_ID_LENGTH = 4
+
+ ROOM_LISTENERS_SUFFIX = "_listeners"
+ ROOM_SPEAKERS_SUFFIX = "_speakers"
+
+ ESCAPE_HATCH_SERVER_LOCK_RELEASE_NAME = "remove_server_lock"
+
+ ###############################################
+ # Configure logger
+ ###############################################
+
+ logger = logging.getLogger("socketio_server_pubsub")
+ logger.propagate = False
+
+ handler = colorlog.StreamHandler(stream=sys.stdout)
+
+ formatter = colorlog.ColoredFormatter(
+     "%(log_color)s[%(asctime)s][%(levelname)s][%(module)s]:%(reset)s %(message)s",
+     reset=True,
+     log_colors={
+         "DEBUG": "cyan",
+         "INFO": "green",
+         "WARNING": "yellow",
+         "ERROR": "red",
+         "CRITICAL": "red,bg_white",
+     },
+ )
+
+ handler.setFormatter(formatter)
+ logger.addHandler(handler)
+
+ logger.setLevel(logging.WARNING)
+
+ print("")
+ print("")
+ print("=" * 20 + " ⭐️ Starting Server... ⭐️ " + "=" * 20)
+
+ ###############################################
+ # Configure socketio server
+ ###############################################
+
+ CLIENT_BUILD_PATH = "../streaming-react-app/dist/"
+ static_files = {
+     "/": CLIENT_BUILD_PATH,
+     "/assets/seamless-db6a2555.svg": {
+         "filename": CLIENT_BUILD_PATH + "assets/seamless-db6a2555.svg",
+         "content_type": "image/svg+xml",
+     },
+ }
+
+ # sio is the main socket.io entrypoint
+ sio = socketio.AsyncServer(
+     async_mode="asgi",
+     cors_allowed_origins="*",
+     logger=logger,
+     # engineio_logger=logger,
+ )
+ # sio.logger.setLevel(logging.DEBUG)
+ socketio_app = socketio.ASGIApp(sio)
+
+ app_routes = [
+     Mount("/ws", app=socketio_app),  # Mount Socket.IO server under /ws
+     Mount(
+         "/", app=StaticFiles(directory=CLIENT_BUILD_PATH, html=True)
+     ),  # Serve static files from root
+ ]
+ app = Starlette(debug=True, routes=app_routes)
+
+ # rooms is indexed by room_id
+ rooms: Dict[str, Room] = {}
+
+
+ class MemberDirectoryObject(TypedDict):
+     room: Room
+     member_object: Member
+
+
+ # member_directory is indexed by client_id
+ # NOTE: client_id is really "client session id", meaning that it is unique to a single browser session.
+ # If a user opens a new tab, they will have a different client_id and can join another room, join
+ # the same room with different roles, etc.
+ # NOTE: For a long-running production server we would want to clean up members after a certain timeout
+ # but for this limited application we can just keep them around
+ member_directory: Dict[str, MemberDirectoryObject] = {}
+
+
+ class ServerLock(TypedDict):
+     name: str
+     client_id: str
+     member_object: Member
+
+
+ MAX_SPEAKERS = os.environ.get("MAX_SPEAKERS")
+
+ if os.environ.get("LOCK_SERVER_COMPLETELY", "0") == "1":
+     logger.info("LOCK_SERVER_COMPLETELY is set. Server will be locked on startup.")
+ if MAX_SPEAKERS is not None and int(MAX_SPEAKERS):
+     logger.info(f"MAX_SPEAKERS is set to: {MAX_SPEAKERS}")
+ dummy_server_lock_member_object = Member(
+     client_id="seamless_user", session_id="dummy", name="Seamless User"
+ )
+ # Normally this would be an actual transcoder, but it's fine putting True here since currently we only check for the presence of the transcoder
+ dummy_server_lock_member_object.transcoder = True
+ server_lock: Optional[ServerLock] = (
+     {
+         "name": "Seamless User",
+         "client_id": "seamless_user",
+         "member_object": dummy_server_lock_member_object,
+     }
+     if os.environ.get("LOCK_SERVER_COMPLETELY", "0") == "1"
+     else None
+ )
+
+ server_id = str(uuid4())
+
+ # Specify specific models to load (some environments have issues loading multiple models)
+ # See AgentWithInfo for JSON format details.
+ models_override = os.environ.get("MODELS_OVERRIDE")
+
+ available_agents = SimulevalAgentDirectory()
+ logger.info("Building and adding agents...")
+ if models_override is not None:
+     logger.info(f"MODELS_OVERRIDE supplied from env vars: {models_override}")
+ available_agents.build_and_add_agents(models_override)
+
+ agents_capabilities_for_json = available_agents.get_agents_capabilities_list_for_json()
+
+
+ ###############################################
+ # Helpers
+ ###############################################
+
+
+ def catch_and_log_exceptions_for_sio_event_handlers(func):
+     # wrapper should have the same signature as the original function
+     async def catch_exception_wrapper(*args, **kwargs):
+         try:
+             return await func(*args, **kwargs)
+         except Exception as e:
+             message = f"[app_pubsub] Caught exception in '{func.__name__}' event handler:\n\n{e}"
+             logger.exception(message, stack_info=True)
+
+             try:
+                 exception_data = {
+                     "message": message,
+                     "timeEpochMs": int(time.time() * 1000),
+                 }
+
+                 try:
+                     # Let's try to add as much useful metadata as possible to the server_exception event
+                     sid = args[0]
+                     if isinstance(sid, str) and len(sid) > 0:
+                         session_data = await get_session_data(sid)
+                         if session_data:
+                             client_id = session_data.get("client_id")
+                             member = session_data.get("member_object")
+                             room = session_data.get("room_object")
+
+                             exception_data["room"] = str(room)
+                             exception_data["member"] = str(member)
+                             exception_data["clientID"] = str(client_id)
+                 except Exception as inner_e:
+                     # We expect there will be times when clientID or other values aren't present, so just log this as a warning
+                     logger.warn(
+                         f"[app_pubsub] Caught exception while trying to add additional_data to server_exception:\n\n{inner_e}"
+                     )
+
+                 # For now let's emit this to all clients. We ultimately may want to emit it just to the room it's happening in.
+                 await sio.emit("server_exception", exception_data)
+             except Exception as inner_e:
+                 logger.exception(
+                     f"[app_pubsub] Caught exception while trying to emit server_exception event:\n{inner_e}"
+                 )
+
+             # Re-raise the exception so it's handled normally by the server
+             raise e
+
+     # Set the name of the wrapper to the name of the original function so that the socketio server can associate it with the right event
+     catch_exception_wrapper.__name__ = func.__name__
+     return catch_exception_wrapper
+
+
+ async def emit_room_state_update(room):
+     await sio.emit(
+         "room_state_update",
+         room.to_json(),
+         room=room.room_id,
+     )
+
+
+ async def emit_server_state_update():
+     room_statuses = {
+         room_id: room.get_room_status_dict() for room_id, room in rooms.items()
+     }
+     total_active_connections = sum(
+         [room_status["activeConnections"] for room_status in room_statuses.values()]
+     )
+     total_active_transcoders = sum(
+         [room_status["activeTranscoders"] for room_status in room_statuses.values()]
+     )
+     logger.info(
+         f"[Server Status]: {total_active_connections} active connections (in rooms); {total_active_transcoders} active transcoders"
+     )
+     logger.info(f"[Server Status]: server_lock={server_lock}")
+     server_lock_object_for_js = (
+         {
+             "name": server_lock.get("name"),
+             "clientID": server_lock.get("client_id"),
+             "isActive": server_lock.get("member_object")
+             and server_lock.get("member_object").transcoder is not None,
+         }
+         if server_lock
+         else None
+     )
+     await sio.emit(
+         "server_state_update",
+         {
+             "statusByRoom": room_statuses,
+             "totalActiveConnections": total_active_connections,
+             "totalActiveTranscoders": total_active_transcoders,
+             "agentsCapabilities": agents_capabilities_for_json,
+             "serverLock": server_lock_object_for_js,
+         },
+         room=ALL_ROOM_ID,
+     )
+
+
+ async def get_session_data(sid):
+     session = await sio.get_session(sid)
+     # It seems like if the session has not been set that get_session may return None, so let's provide a fallback empty dictionary here
+     return session or {}
+
+
+ async def set_session_data(sid, client_id, room_id, room_object, member_object):
+     await sio.save_session(
+         sid,
+         {
+             "client_id": client_id,
+             "room_id": room_id,
+             "room_object": room_object,
+             "member_object": member_object,
+         },
+     )
+
+
+ def get_random_room_id():
+     return "".join(random.choices(ROOM_ID_USABLE_CHARACTERS, k=ROOM_ID_LENGTH))
+
+
+ def get_random_unused_room_id():
+     room_id = get_random_room_id()
+     while room_id in rooms:
+         room_id = get_random_room_id()
+     return room_id
+
+
+ ###############################################
+ # Socket.io Basic Event Handlers
+ ###############################################
+
+
+ @sio.on("connect")
+ @catch_and_log_exceptions_for_sio_event_handlers
+ async def connect(sid, environ):
+     logger.info(f"📥 [event: connected] sid={sid}")
+
+     # TODO: Sanitize/validate query param input
+     query_params = dict(parse.parse_qsl(environ["QUERY_STRING"]))
+     client_id = query_params.get("clientID")
+
+     logger.debug(f"query_params:\n{pformat(query_params)}")
+
+     if client_id is None:
+         logger.info("No clientID provided. Disconnecting...")
+         await sio.disconnect(sid)
+         return
+
+     # On reconnect we need to rejoin rooms and reset session data
+     if member_directory.get(client_id):
+         room = member_directory[client_id].get("room")
+         room_id = room.room_id
+         # Note: We could also get this from room.members[client_id]
+         member = member_directory[client_id].get("member_object")
+
+         member.connection_status = "connected"
+         member.session_id = sid
+
+         logger.info(
+             f"[event: connect] {member} reconnected. Attempting to re-add them to socketio rooms and reset session data."
+         )
+
+         if room is None or member is None:
+             logger.error(
+                 f"[event: connect] {client_id} is reconnecting, but room or member is None. This should not happen."
+             )
+             await sio.disconnect(sid)
+             return
+
+         sio.enter_room(sid, room_id)
+         sio.enter_room(sid, ALL_ROOM_ID)
+
+         if client_id in room.listeners:
+             sio.enter_room(sid, f"{room_id}{ROOM_LISTENERS_SUFFIX}")
+         if client_id in room.speakers:
+             sio.enter_room(sid, f"{room_id}{ROOM_SPEAKERS_SUFFIX}")
+
+         # Save the room_id to the socketio client session
+         await set_session_data(
+             sid,
+             client_id=client_id,
+             room_id=room.room_id,
+             room_object=room,
+             member_object=member,
+         )
+         await emit_room_state_update(room)
+     else:
+         # Save the client id to the socketio client session
+         await set_session_data(
+             sid, client_id=client_id, room_id=None, room_object=None, member_object=None
+         )
+
+     await sio.emit("server_id", server_id, to=sid)
+     await emit_server_state_update()
+
+
+ @sio.event
+ @catch_and_log_exceptions_for_sio_event_handlers
+ async def disconnect(sid):
+     global server_lock
+     session_data = await get_session_data(sid)
+     # logger.info("session_data", session_data)
+
+     client_id = None
+     member = None
+     room = None
+
+     if session_data:
+         client_id = session_data.get("client_id")
+         member = session_data.get("member_object")
+         room = session_data.get("room_object")
+
+     logger.info(
+         f"[event: disconnect][{room or 'NOT_IN_ROOM'}] member: {member or 'NO_MEMBER_OBJECT'} disconnected"
+     )
+
+     # Release the lock if this is the client that holds the current server lock
+     if server_lock and server_lock.get("client_id") == client_id:
+         server_lock = None
+
+     if member:
+         member.connection_status = "disconnected"
+
+         if member.transcoder:
+             member.transcoder.close = True
+             member.transcoder = None
+             member.requested_output_type = None
+
+         if room:
+             logger.info(
+                 f"[event: disconnect] {member} disconnected from room {room.room_id}"
+             )
+             await emit_room_state_update(room)
+         else:
+             logger.info(
+                 f"[event: disconnect] {member} disconnected, but no room object present. This should not happen."
+             )
+     else:
+         logger.info(
+             f"[event: disconnect] client_id {client_id or 'NO_CLIENT_ID'} with sid {sid} in rooms {str(sio.rooms(sid))} disconnected"
+         )
+
+     await emit_server_state_update()
+
+
+ @sio.on("*")
+ async def catch_all(event, sid, data):
+     logger.info(f"[unhandled event: {event}] sid={sid} data={data}")
+
+
+ ###############################################
+ # Socket.io Streaming Event handlers
+ ###############################################
+
+
+ @sio.on("join_room")
+ @catch_and_log_exceptions_for_sio_event_handlers
+ async def join_room(sid, client_id, room_id_from_client, config_dict):
+     global server_lock
+
+     args = {
+         "sid": sid,
+         "client_id": client_id,
+         "room_id": room_id_from_client,
+         "config_dict": config_dict,
+     }
+     logger.info(f"[event: join_room] {args}")
+     session_data = await get_session_data(sid)
+     logger.info(f"session_data: {session_data}")
+
+     room_id = room_id_from_client
+     if room_id is None:
+         room_id = get_random_unused_room_id()
+         logger.info(
+             f"No room_id provided. Generating a random, unused room_id: {room_id}"
+         )
+
+     # Create the room if it doesn't already exist
+     if room_id not in rooms:
+         rooms[room_id] = Room(room_id)
+
+     room = rooms[room_id]
+
+     member = None
+
+     name = "[NO_NAME]"
+
+     # If the client is reconnecting use their existing member object. Otherwise create a new one.
+     if client_id in room.members:
+         member = room.members[client_id]
+         logger.info(f"{member} is rejoining room {room_id}.")
+     else:
+         member_number = len(room.members) + 1
+         name = f"Member {member_number}"
+         member = Member(
+             client_id=client_id,
+             session_id=sid,
+             name=name,
+         )
+         logger.info(f"Created a new Member object: {member}")
+         logger.info(f"Adding {member} to room {room_id}")
+         room.members[client_id] = member
+
+     # Also add them to the member directory
+     member_directory[client_id] = {"room": room, "member_object": member}
+
+     # Join the socketio room, which enables broadcasting to all members of the room
+     sio.enter_room(sid, room_id)
+     # Join the room for all clients
+     sio.enter_room(sid, ALL_ROOM_ID)
+
+     if "listener" in config_dict["roles"]:
+         sio.enter_room(sid, f"{room_id}{ROOM_LISTENERS_SUFFIX}")
+         if client_id not in room.listeners:
+             room.listeners.append(client_id)
+     else:
+         sio.leave_room(sid, f"{room_id}{ROOM_LISTENERS_SUFFIX}")
+         room.listeners = [
+             listener_id for listener_id in room.listeners if listener_id != client_id
+         ]
+
+     if "speaker" in config_dict["roles"]:
+         sio.enter_room(sid, f"{room_id}{ROOM_SPEAKERS_SUFFIX}")
+         if client_id not in room.speakers:
+             room.speakers.append(client_id)
+     else:
+         sio.leave_room(sid, f"{room_id}{ROOM_SPEAKERS_SUFFIX}")
+         # If the person is no longer a speaker they should no longer be able to lock the server
+         if server_lock and server_lock.get("client_id") == client_id:
+             logger.info(
+                 f"🔓 Server is now unlocked from client {server_lock.get('client_id')} with name/info: {server_lock.get('name')}"
+             )
+             server_lock = None
+         if member.transcoder:
+             member.transcoder.close = True
+             member.transcoder = None
+         room.speakers = [
+             speaker_id for speaker_id in room.speakers if speaker_id != client_id
+         ]
+
+     # If we currently own the server lock and are updating roles and we no longer have server lock specified, release it
+     if (
+         server_lock is not None
+         and server_lock["client_id"] == client_id
+         and config_dict.get("lockServerName") is None
+     ):
+         logger.info(f"[join_room] Releasing server lock: {pformat(server_lock)}")
+         server_lock = None
+
+     # Only speakers should be able to lock the server
+     if config_dict.get("lockServerName") is not None and "speaker" in config_dict.get(
+         "roles", {}
+     ):
+         # If something goes wrong and the server gets stuck in a locked state the client can
+         # force the server to remove the lock by passing the special name ESCAPE_HATCH_SERVER_LOCK_RELEASE_NAME
+         if (
+             server_lock is not None
+             and config_dict.get("lockServerName")
+             == ESCAPE_HATCH_SERVER_LOCK_RELEASE_NAME
+             # If we are locking the server completely we don't want someone to be able to unlock it
+             and not os.environ.get("LOCK_SERVER_COMPLETELY", "0") == "1"
+         ):
+             server_lock = None
+             logger.info(
+                 f"🔓 Server lock has been reset by {client_id} using the escape hatch name {ESCAPE_HATCH_SERVER_LOCK_RELEASE_NAME}"
+             )
+
+         # If the server is not locked, set a lock. If it's already locked to this client, update the lock object
+         if server_lock is None or server_lock.get("client_id") == client_id:
+             # TODO: Add some sort of timeout as a backstop in case someone leaves the browser tab open after locking the server
+             server_lock = {
+                 "name": config_dict.get("lockServerName"),
+                 "client_id": client_id,
+                 "member_object": member,
+             }
+             logger.info(
+                 f"🔒 Server is now locked to client {server_lock.get('client_id')} with name/info: {server_lock.get('name')}\nThis client will have priority over all others until they disconnect."
+             )
+         # If the server is already locked to someone else, don't allow this client to lock it
+         elif server_lock is not None and server_lock.get("client_id") != client_id:
+             logger.warn(
+                 f"⚠️ Server is already locked to client {server_lock.get('client_id')}. Ignoring request to lock to client {client_id}."
+             )
+             # TODO: Maybe throw an error here?
+
+     # Save the room_id to the socketio client session
+     await set_session_data(
+         sid,
+         client_id=client_id,
+         room_id=room_id,
+         room_object=room,
+         member_object=member,
+     )
+
+     await emit_room_state_update(room)
+     await emit_server_state_update()
+
+     return {"roomsJoined": sio.rooms(sid), "roomID": room_id}
+
+
+ def allow_speaker(room, client_id):
+     if MAX_SPEAKERS is not None and client_id in room.speakers:
+         room_statuses = {room_id: room.get_room_status_dict() for room_id, room in rooms.items()}
+         speakers = sum(room_status["activeTranscoders"] for room_status in room_statuses.values())
+         return speakers < int(MAX_SPEAKERS)
+     return True
+
+
+ # TODO: Add code to prevent more than one speaker from connecting/streaming at a time
+ @sio.event
+ @catch_and_log_exceptions_for_sio_event_handlers
+ async def configure_stream(sid, config):
+     session_data = await get_session_data(sid)
+     client_id, member, room = itemgetter("client_id", "member_object", "room_object")(
+         session_data
+     )
+
+     logger.debug(
+         f"[event: configure_stream][{room}] Received stream config from {member}\n{pformat(config)}"
+     )
+
+     if member is None or room is None:
+         logger.error(
+             f"Received stream config from {member}, but member or room is None. This should not happen."
+         )
+         return {"status": "error", "message": "member_or_room_is_none"}
+
+     if not allow_speaker(room, client_id):
+         logger.error(
+             f"In MAX_SPEAKERS mode we only allow one speaker at a time. Ignoring request to configure stream from client {client_id}."
+         )
+         return {"status": "error", "message": "max_speakers"}
+
+     # If there is a server lock WITH an active transcoder session, prevent other users from configuring and starting a stream
+     # If the server lock client does NOT have an active transcoder session allow this to proceed, knowing that
+     # this stream will be interrupted if the server lock client starts streaming
+     if (
+         server_lock is not None
+         and server_lock.get("client_id") != client_id
+         and server_lock.get("member_object")
+         and server_lock.get("member_object").transcoder is not None
+     ):
+         logger.warn(
+             f"Server is locked to client {server_lock.get('client_id')}. Ignoring request to configure stream from client {client_id}."
+         )
+         return {"status": "error", "message": "server_locked"}
+
+     debug = config.get("debug")
+     async_processing = config.get("async_processing")
+
+     # Currently s2s, s2t or s2s&t
+     model_type = config.get("model_type")
+     member.requested_output_type = model_type
+
+     model_name = config.get("model_name")
+
+     try:
+         agent = available_agents.get_agent_or_throw(model_name)
+     except NoAvailableAgentException as e:
+         logger.warn(f"Error while getting agent: {e}")
+         # await sio.emit("error", str(e), to=sid)
+         await sio.disconnect(sid)
+         return {"status": "error", "message": str(e)}
+
+     if member.transcoder:
+         logger.warn(
+             "Member already has a transcoder configured. Closing it, and overwriting with a new transcoder..."
+         )
+         member.transcoder.close = True
+
+     t0 = time.time()
+     try:
+         member.transcoder = SimulevalTranscoder(
+             agent,
+             config["rate"],
+             debug=debug,
+             buffer_limit=int(config["buffer_limit"]),
+         )
+     except Exception as e:
+         logger.warn(f"Got exception while initializing agents: {e}")
+         # await sio.emit("error", str(e), to=sid)
+         await sio.disconnect(sid)
+         return {"status": "error", "message": str(e)}
+
+     t1 = time.time()
+     logger.debug(f"Booting up VAD and transcoder took {t1-t0} sec")
+
+     # TODO: if async_processing is false, then we need to run transcoder.process_pipeline_once() whenever we receive audio, or at some other sensible interval
+     if async_processing:
+         member.transcoder.start()
+
+     # We need to emit a room state update here since room state now includes # of active transcoders
+     await emit_room_state_update(room)
+     await emit_server_state_update()
+
+     return {"status": "ok", "message": "server_ready"}
+
+
+ # The config here is a partial config, meaning it may not contain all the config values -- only the ones the user
+ # wants to change
+ @sio.on("set_dynamic_config")
+ @catch_and_log_exceptions_for_sio_event_handlers
+ async def set_dynamic_config(
+     sid,
+     # partial_config's type is defined in StreamingTypes.ts
+     partial_config,
+ ):
+     session_data = await get_session_data(sid)
+
+     member = None
+
+     if session_data:
+         member = session_data.get("member_object")
+
+     if member:
+         new_dynamic_config = {
+             **(member.transcoder_dynamic_config or {}),
+             **partial_config,
+         }
+         logger.info(
+             f"[set_dynamic_config] Setting new dynamic config:\n\n{pformat(new_dynamic_config)}\n"
+         )
+         member.transcoder_dynamic_config = new_dynamic_config
+
+     return {"status": "ok", "message": "dynamic_config_set"}
+
+
+ @sio.event
+ @catch_and_log_exceptions_for_sio_event_handlers
+ async def incoming_audio(sid, blob):
+     session_data = await get_session_data(sid)
+
+     client_id = None
+     member = None
+     room = None
+
+     if session_data:
+         client_id = session_data.get("client_id")
+         member = session_data.get("member_object")
+         room = session_data.get("room_object")
+
+     logger.debug(f"[event: incoming_audio] from member {member}")
+
+     # If the server is locked by someone else, kill our transcoder and ignore incoming audio
+     # If the server lock client does NOT have an active transcoder session allow this incoming audio pipeline to proceed,
+     # knowing that this stream will be interrupted if the server lock client starts streaming
+     if (
+         server_lock is not None
+         and server_lock.get("client_id") != client_id
+         and server_lock.get("member_object")
+         and server_lock.get("member_object").transcoder is not None
+     ):
+         # TODO: Send an event to the client to let them know their streaming session has been killed
+         if member.transcoder:
+             member.transcoder.close = True
+             member.transcoder = None
+             # Update both room state and server state given that the number of active transcoders has changed
+             if room:
+                 await emit_room_state_update(room)
+             await emit_server_state_update()
+         logger.warn(
+             f"[incoming_audio] Server is locked to client {server_lock.get('client_id')}. Ignoring incoming audio from client {client_id}."
+         )
+         return
+
+     if member is None or room is None:
+         logger.error(
+             f"[incoming_audio] Received incoming_audio from {member}, but member or room is None. This should not happen."
+         )
+         return
+
+     # NOTE: bytes and bytearray are very similar, but bytes is immutable, and is what is returned by socketio
+     if not isinstance(blob, bytes):
+         logger.error(
+             f"[incoming_audio] Received audio from {member}, but it was not of type `bytes`. type(blob) = {type(blob)}"
+         )
+         return
+
+     if member.transcoder is None:
+         logger.error(
+             f"[incoming_audio] Received audio from {member}, but no transcoder configured to process it (member.transcoder is None). This should not happen."
+         )
+         return
+
+     member.transcoder.process_incoming_bytes(
+         blob, dynamic_config=member.transcoder_dynamic_config
+     )
+
+     # Send back any available model output
+     # NOTE: In theory it would make sense to remove this from the incoming_audio handler and
+     # handle this in a dedicated thread that checks for output and sends it right away,
+     # but in practice for our limited demo use cases this approach didn't add noticeable
+     # latency, so we're keeping it simple for now.
+     events = get_transcoder_output_events(member.transcoder)
+     logger.debug(f"[incoming_audio] transcoder output events: {len(events)}")
+
+     if len(events) == 0:
+         logger.debug("[incoming_audio] No transcoder output to send")
+     else:
+         for e in events:
+             if e["event"] == "translation_speech" and member.requested_output_type in [
+                 "s2s",
+                 "s2s&t",
+             ]:
+                 logger.debug("[incoming_audio] Sending translation_speech event")
+                 await sio.emit(
+                     "translation_speech", e, room=f"{room.room_id}_listeners"
+                 )
+             elif e["event"] == "translation_text" and member.requested_output_type in [
+                 "s2t",
+                 "s2s&t",
+             ]:
+                 logger.debug("[incoming_audio] Sending translation_text event")
+                 await sio.emit("translation_text", e, room=f"{room.room_id}_listeners")
+             else:
+                 logger.error(f"[incoming_audio] Unexpected event type: {e['event']}")
+
+     return
+
+
+ @sio.event
+ @catch_and_log_exceptions_for_sio_event_handlers
+ async def stop_stream(sid):
+     session_data = await get_session_data(sid)
+     client_id, member, room = itemgetter("client_id", "member_object", "room_object")(
+         session_data
+     )
+
+     logger.debug(f"[event: stop_stream][{room}] Attempting to stop stream for {member}")
+
+     if member is None or room is None:
+         message = f"Received stop_stream from {member}, but member or room is None. This should not happen."
+         logger.error(message)
+         return {"status": "error", "message": message}
+
+     # In order to stop the stream and end the transcoder thread, set close to True and unset it for the member
+     if member.transcoder:
+         member.transcoder.close = True
+         member.transcoder = None
+     else:
+         message = f"Received stop_stream from {member}, but member.transcoder is None. This should not happen."
+         logger.warn(message)
+
+     # We need to emit a room state update here since room state now includes # of active transcoders
+     await emit_room_state_update(room)
+     # Emit a server state update now that we've changed the number of active transcoders
+     await emit_server_state_update()
+
+     return {"status": "ok", "message": "Stream stopped"}
+
+
+ @sio.on("clear_transcript_for_all")
+ @catch_and_log_exceptions_for_sio_event_handlers
+ async def clear_transcript_for_all(sid):
+     session_data = await get_session_data(sid)
+
+     room = session_data.get("room_object")
+
+     if room:
+         await sio.emit("clear_transcript", room=f"{room.room_id}")
+     else:
+         logger.error("[clear_transcript] room is None. This should not happen.")
+
+
+ @sio.event
+ @catch_and_log_exceptions_for_sio_event_handlers
+ async def set_name(sid, name):
+     logger.info(f"[Event: set_name] name={name}")
+     await sio.save_session(sid, {"name": name})
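For orientation, here is a minimal client sketch. It is not part of the commit: the `socketio_path` and payload shapes are assumptions inferred from the handlers above (the server reads a `clientID` query parameter on connect, and `join_room` receives `(client_id, room_id_or_None, config_dict)` with a `roles` list).

```python
# Minimal python-socketio client sketch (hypothetical; inferred from the
# handlers above, not taken from the repo).
import socketio
from uuid import uuid4

client_id = str(uuid4())
sio = socketio.Client()

# connect() disconnects clients that omit the clientID query parameter.
sio.connect(
    f"http://localhost:7860?clientID={client_id}",
    socketio_path="/ws/socket.io",  # the Socket.IO app is mounted under /ws
)

# Join (or create) a room as a listener; passing None for the room ID makes
# the server generate a random unused 4-letter room ID.
result = sio.call("join_room", (client_id, None, {"roles": ["listener"]}))
print(result["roomID"], result["roomsJoined"])
```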
seamless_server/models/.DS_Store ADDED
Binary file (6.15 kB).
 
seamless_server/models/Seamless/vad_s2st_sc_24khz_main.yaml ADDED
@@ -0,0 +1,25 @@
+ agent_class: seamless_communication.streaming.agents.seamless_s2st.SeamlessS2STDualVocoderVADAgent
+ monotonic_decoder_model_name: seamless_streaming_monotonic_decoder
+ unity_model_name: seamless_streaming_unity
+ sentencepiece_model: spm_256k_nllb100.model
+
+ task: s2st
+ tgt_lang: "eng"
+ min_unit_chunk_size: 50
+ decision_threshold: 0.7
+ no_early_stop: True
+ block_ngrams: True
+ vocoder_name: vocoder_v2
+ expr_vocoder_name: vocoder_pretssel
+ gated_model_dir: .
+ expr_vocoder_gain: 3.0
+ upstream_idx: 1
+ wav2vec_yaml: wav2vec.yaml
+ min_starting_wait_w2vbert: 192
+
+ config_yaml: cfg_fbank_u2t.yaml
+ upstream_idx: 1
+ detokenize_only: True
+ device: cuda:0
+ max_len_a: 0
+ max_len_b: 1000
seamless_server/models/SeamlessStreaming/vad_s2st_sc_main.yaml ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
+ agent_class: seamless_communication.streaming.agents.seamless_streaming_s2st.SeamlessStreamingS2STJointVADAgent
+ monotonic_decoder_model_name: seamless_streaming_monotonic_decoder
+ unity_model_name: seamless_streaming_unity
+ sentencepiece_model: spm_256k_nllb100.model
+
+ task: s2st
+ tgt_lang: "eng"
+ min_unit_chunk_size: 50
+ decision_threshold: 0.7
+ no_early_stop: True
+ block_ngrams: True
+ vocoder_name: vocoder_v2
+ wav2vec_yaml: wav2vec.yaml
+ min_starting_wait_w2vbert: 192
+
+ config_yaml: cfg_fbank_u2t.yaml
+ upstream_idx: 1
+ detokenize_only: True
+ device: cuda:0
+ max_len_a: 0
+ max_len_b: 1000
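Both YAML files are SimulEval agent-pipeline configs. A minimal loading sketch, assuming `build_system_from_dir` accepts the model directory and a `config_name` keyword (it is imported in seamless_server/src/simuleval_transcoder.py, but the exact call is not shown in this diff):

```python
# Hypothetical loading sketch; SimulevalTranscoder.build_agent (not shown in
# this diff) presumably wraps something like this.
from simuleval.utils.agent import build_system_from_dir

agent = build_system_from_dir(
    "models/SeamlessStreaming",           # directory holding the model assets
    config_name="vad_s2st_sc_main.yaml",  # the config shown above
)
states = agent.build_states()  # per-stream state, as used by SimulevalTranscoder
```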
seamless_server/requirements.txt ADDED
@@ -0,0 +1,29 @@
+ # seamless_communication
+ git+https://github.com/facebookresearch/seamless_communication.git
+ # ./whl/seamless_communication-1.0.0-py3-none-any.whl
+ Flask==2.1.3
+ Flask_Sockets==0.2.1
+ g2p_en==2.1.0
+ gevent==22.10.2
+ gevent_websocket==0.10.1
+ librosa==0.9.2
+ numpy==1.24.4
+ openai_whisper==20230124
+ protobuf==4.24.2
+ psola==0.0.1
+ pydub==0.25.1
+ silero==0.4.1
+ soundfile==0.11.0
+ stable_ts==1.4.0
+ # torch # to be installed by user for desired PyTorch version
+ # simuleval # to be installed by seamless_communication
+ Werkzeug==2.0.3
+ whisper==1.1.10
+ colorlog==6.7.0
+ python-socketio==5.9.0
+ uvicorn[standard]==0.23.2
+ parallel-wavegan==0.5.5
+ python-jose[cryptography]==3.3.0
+ starlette==0.32.0.post1
+ hf_transfer==0.1.4
+ huggingface_hub==0.19.4
seamless_server/run_docker.sh ADDED
@@ -0,0 +1,5 @@
+ #!/bin/bash
+ if [ -f models/Seamless/pretssel_melhifigan_wm.pt ] ; then
+     export USE_EXPRESSIVE_MODEL=1;
+ fi
+ uvicorn app_pubsub:app --host 0.0.0.0 --port 7860
seamless_server/src/room.py ADDED
@@ -0,0 +1,64 @@
+ # import json
+ import uuid
+
+
+ class Room:
+     def __init__(self, room_id) -> None:
+         self.room_id = room_id
+         # members is a dict from client_id to Member
+         self.members = {}
+
+         # listeners and speakers are lists of client_id's
+         self.listeners = []
+         self.speakers = []
+
+     def __str__(self) -> str:
+         return f"Room {self.room_id} ({len(self.members)} member{'s' if len(self.members) != 1 else ''})"
+
+     def to_json(self):
+         varsResult = vars(self)
+         # Remember: result is just a shallow copy, so result.members === self.members
+         # Because of that, we need to jsonify self.members without writing over result.members,
+         # which we do here via dictionary unpacking (the ** operator)
+         result = {
+             **varsResult,
+             "members": {key: value.to_json() for (key, value) in self.members.items()},
+             "activeTranscoders": self.get_active_transcoders(),
+         }
+
+         return result
+
+     def get_active_connections(self):
+         return len(
+             [m for m in self.members.values() if m.connection_status == "connected"]
+         )
+
+     def get_active_transcoders(self):
+         return len([m for m in self.members.values() if m.transcoder is not None])
+
+     def get_room_status_dict(self):
+         return {
+             "activeConnections": self.get_active_connections(),
+             "activeTranscoders": self.get_active_transcoders(),
+         }
+
+
+ class Member:
+     def __init__(self, client_id, session_id, name) -> None:
+         self.client_id = client_id
+         self.session_id = session_id
+         self.name = name
+         self.connection_status = "connected"
+         self.transcoder = None
+         self.requested_output_type = None
+         self.transcoder_dynamic_config = None
+
+     def __str__(self) -> str:
+         return f"{self.name} (id: {self.client_id[:4]}...) ({self.connection_status})"
+
+     def to_json(self):
+         self_vars = vars(self)
+         return {
+             **self_vars,
+             "transcoder": self.transcoder is not None,
+         }
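As a quick illustration (hypothetical usage, not part of the commit), this is roughly the payload `Room.to_json()` produces for the `room_state_update` event emitted in app_pubsub.py:

```python
# Hypothetical usage of the classes above.
from src.room import Room, Member

room = Room("ABCD")
room.members["c1"] = Member(client_id="c1", session_id="sid1", name="Member 1")
room.listeners.append("c1")

print(room.to_json())
# {'room_id': 'ABCD',
#  'members': {'c1': {'client_id': 'c1', 'session_id': 'sid1', 'name': 'Member 1',
#                     'connection_status': 'connected', 'transcoder': False,
#                     'requested_output_type': None, 'transcoder_dynamic_config': None}},
#  'listeners': ['c1'], 'speakers': [], 'activeTranscoders': 0}
```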
seamless_server/src/simuleval_agent_directory.py ADDED
@@ -0,0 +1,171 @@
+ # Creates a directory in which to look up available agents
+
+ import os
+ from typing import List, Optional
+ from src.simuleval_transcoder import SimulevalTranscoder
+ import json
+ import logging
+
+ logger = logging.getLogger("socketio_server_pubsub")
+
+ # fmt: off
+ M4T_P0_LANGS = [
+     "eng",
+     "arb", "ben", "cat", "ces", "cmn", "cym", "dan",
+     "deu", "est", "fin", "fra", "hin", "ind", "ita",
+     "jpn", "kor", "mlt", "nld", "pes", "pol", "por",
+     "ron", "rus", "slk", "spa", "swe", "swh", "tel",
+     "tgl", "tha", "tur", "ukr", "urd", "uzn", "vie",
+ ]
+ # fmt: on
+
+
+ class NoAvailableAgentException(Exception):
+     pass
+
+
+ class AgentWithInfo:
+     def __init__(
+         self,
+         agent,
+         name: str,
+         modalities: List[str],
+         target_langs: List[str],
+         # Supported dynamic params are defined in StreamingTypes.ts
+         dynamic_params: List[str] = [],
+         description="",
+         has_expressive: Optional[bool] = None,
+     ):
+         self.agent = agent
+         self.has_expressive = has_expressive
+         self.name = name
+         self.description = description
+         self.modalities = modalities
+         self.target_langs = target_langs
+         self.dynamic_params = dynamic_params
+
+     def get_capabilities_for_json(self):
+         return {
+             "name": self.name,
+             "description": self.description,
+             "modalities": self.modalities,
+             "targetLangs": self.target_langs,
+             "dynamicParams": self.dynamic_params,
+         }
+
+     @classmethod
+     def load_from_json(cls, config: str):
+         """
+         Takes in a JSON array of models to load, e.g.
+         [{"name": "s2s_m4t_emma-unity2_multidomain_v0.1", "description": "M4T model that supports simultaneous S2S and S2T", "modalities": ["s2t", "s2s"], "targetLangs": ["en"]},
+          {"name": "s2s_m4t_expr-emma_v0.1", "description": "ES-EN expressive model that supports S2S and S2T", "modalities": ["s2t", "s2s"], "targetLangs": ["en"]}]
+         """
+         configs = json.loads(config)
+         agents = []
+         for config in configs:
+             agent = SimulevalTranscoder.build_agent(config["name"])
+             agents.append(
+                 AgentWithInfo(
+                     agent=agent,
+                     name=config["name"],
+                     modalities=config["modalities"],
+                     target_langs=config["targetLangs"],
+                 )
+             )
+         return agents
+
+
+ class SimulevalAgentDirectory:
+     # Available models. These are the directories where the models can be found, and also serve as an ID for the model.
+     seamless_streaming_agent = "SeamlessStreaming"
+     seamless_agent = "Seamless"
+
+     def __init__(self):
+         self.agents = []
+         self.did_build_and_add_agents = False
+
+     def add_agent(self, agent: AgentWithInfo):
+         self.agents.append(agent)
+
+     def build_agent_if_available(self, model_id, config_name=None):
+         agent = None
+         try:
+             if config_name is not None:
+                 agent = SimulevalTranscoder.build_agent(
+                     model_id,
+                     config_name=config_name,
+                 )
+             else:
+                 agent = SimulevalTranscoder.build_agent(
+                     model_id,
+                 )
+         except Exception as e:
+             from fairseq2.assets.error import AssetError
+
+             logger.warning("Failed to build agent %s: %s" % (model_id, e))
+             if isinstance(e, AssetError):
+                 logger.warning(
+                     "Please download gated assets and set `gated_model_dir` in the config"
+                 )
+             raise e
+
+         return agent
+
+     def build_and_add_agents(self, models_override=None):
+         if self.did_build_and_add_agents:
+             return
+
+         if models_override is not None:
+             agent_infos = AgentWithInfo.load_from_json(models_override)
+             for agent_info in agent_infos:
+                 self.add_agent(agent_info)
+         else:
+             s2s_agent = None
+             if os.environ.get("USE_EXPRESSIVE_MODEL", "0") == "1":
+                 logger.info("Building expressive model...")
+                 s2s_agent = self.build_agent_if_available(
+                     SimulevalAgentDirectory.seamless_agent,
+                     config_name="vad_s2st_sc_24khz_main.yaml",
+                 )
+                 has_expressive = True
+             else:
+                 logger.info("Building non-expressive model...")
+                 s2s_agent = self.build_agent_if_available(
+                     SimulevalAgentDirectory.seamless_streaming_agent,
+                     config_name="vad_s2st_sc_main.yaml",
+                 )
+                 has_expressive = False
+
+             if s2s_agent:
+                 self.add_agent(
+                     AgentWithInfo(
+                         agent=s2s_agent,
+                         name=SimulevalAgentDirectory.seamless_streaming_agent,
+                         modalities=["s2t", "s2s"],
+                         target_langs=M4T_P0_LANGS,
+                         dynamic_params=["expressive"],
+                         description="multilingual expressive model that supports S2S and S2T",
+                         has_expressive=has_expressive,
+                     )
+                 )
+
+         if len(self.agents) == 0:
+             logger.error(
+                 "No agents were loaded. This likely means you are missing the actual model files specified in simuleval_agent_directory."
+             )
+
+         self.did_build_and_add_agents = True
+
+     def get_agent(self, name):
+         for agent in self.agents:
+             if agent.name == name:
+                 return agent
+         return None
+
+     def get_agent_or_throw(self, name):
+         agent = self.get_agent(name)
+         if agent is None:
+             raise NoAvailableAgentException("No agent found with name=%s" % (name))
+         return agent
+
+     def get_agents_capabilities_list_for_json(self):
+         return [agent.get_capabilities_for_json() for agent in self.agents]
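The `MODELS_OVERRIDE` path above expects a JSON array of model descriptors. A sketch of supplying it, with a hypothetical entry whose `name` must match a model directory under seamless_server/models/:

```python
# Hypothetical: construct the MODELS_OVERRIDE env var that app_pubsub.py reads
# and passes to build_and_add_agents().
import json
import os

os.environ["MODELS_OVERRIDE"] = json.dumps([
    {
        "name": "SeamlessStreaming",  # must name a directory under models/
        "description": "streaming model that supports S2S and S2T",
        "modalities": ["s2t", "s2s"],
        "targetLangs": ["eng"],
    }
])
```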
seamless_server/src/simuleval_transcoder.py ADDED
@@ -0,0 +1,428 @@
1
+ from simuleval.utils.agent import build_system_from_dir
2
+ from typing import Any, List, Optional, Tuple, Union
3
+ import numpy as np
4
+ import soundfile
5
+ import io
6
+ import asyncio
7
+ from simuleval.agents.pipeline import TreeAgentPipeline
8
+ from simuleval.agents.states import AgentStates
9
+ from simuleval.data.segments import Segment, EmptySegment, SpeechSegment
10
+ import threading
11
+ import math
12
+ import logging
13
+ import sys
14
+ from pathlib import Path
15
+ import time
16
+ from g2p_en import G2p
17
+ import torch
18
+ import traceback
19
+ import time
20
+ import random
21
+ import colorlog
22
+
23
+ from .speech_and_text_output import SpeechAndTextOutput
24
+
25
+ MODEL_SAMPLE_RATE = 16_000
26
+
27
+ logger = logging.getLogger(__name__)
28
+ # logger.propagate = False
29
+ handler = colorlog.StreamHandler(stream=sys.stdout)
30
+ formatter = colorlog.ColoredFormatter(
31
+ "%(log_color)s[%(asctime)s][%(levelname)s][%(module)s]:%(reset)s %(message)s",
32
+ reset=True,
33
+ log_colors={
34
+ "DEBUG": "cyan",
35
+ "INFO": "green",
36
+ "WARNING": "yellow",
37
+ "ERROR": "red",
38
+ "CRITICAL": "red,bg_white",
39
+ },
40
+ )
41
+ handler.setFormatter(formatter)
42
+ logger.addHandler(handler)
43
+ logger.setLevel(logging.WARNING)
44
+
45
+
46
+ class OutputSegments:
47
+ def __init__(self, segments: Union[List[Segment], Segment]):
48
+ if isinstance(segments, Segment):
49
+ segments = [segments]
50
+ self.segments: List[Segment] = [s for s in segments]
51
+
52
+ @property
53
+ def is_empty(self):
54
+ return all(segment.is_empty for segment in self.segments)
55
+
56
+ @property
57
+ def finished(self):
58
+ return all(segment.finished for segment in self.segments)
59
+
60
+ def compute_length(self, g2p):
61
+ lengths = []
62
+ for segment in self.segments:
63
+ if segment.data_type == "text":
64
+ lengths.append(len([x for x in g2p(segment.content) if x != " "]))
65
+ elif segment.data_type == "speech":
66
+ lengths.append(len(segment.content) / MODEL_SAMPLE_RATE)
67
+ elif isinstance(segment, EmptySegment):
68
+ continue
69
+ else:
70
+ logger.warning(
71
+ f"Unexpected data_type: {segment.data_type} not in 'speech', 'text'"
72
+ )
73
+ return max(lengths, default=0)  # default guards against an all-empty segment list
74
+
75
+ @classmethod
76
+ def join_output_buffer(
77
+ cls, buffer: List[List[Segment]], output: SpeechAndTextOutput
78
+ ):
79
+ num_segments = len(buffer[0])
80
+ for i in range(num_segments):
81
+ segment_list = [
82
+ buffer[j][i]
83
+ for j in range(len(buffer))
84
+ if buffer[j][i].data_type is not None
85
+ ]
86
+ if len(segment_list) == 0:
87
+ continue
88
+ if len(set(segment.data_type for segment in segment_list)) != 1:
89
+ logger.warning(
90
+ f"Data type mismatch at {i}: {set(segment.data_type for segment in segment_list)}"
91
+ )
92
+ continue
93
+ data_type = segment_list[0].data_type
94
+ if data_type == "text":
95
+ if output.text is not None:
96
+ logger.warning("Multiple text outputs, overwriting!")
97
+ output.text = " ".join([segment.content for segment in segment_list])
98
+ elif data_type == "speech":
99
+ if output.speech_samples is not None:
100
+ logger.warning("Multiple speech outputs, overwriting!")
101
+ speech_out = []
102
+ for segment in segment_list:
103
+ speech_out += segment.content
104
+ output.speech_samples = speech_out
105
+ output.speech_sample_rate = segment.sample_rate
106
+ elif isinstance(segment_list[0], EmptySegment):
107
+ continue
108
+ else:
109
+ logger.warning(
110
+ f"Invalid output buffer data type: {data_type}, expected 'speech' or 'text"
111
+ )
112
+
113
+ return output
114
+
115
+ def __repr__(self) -> str:
116
+ repr_str = str(self.segments)
117
+ return f"{self.__class__.__name__}(\n\t{repr_str}\n)"
118
+
119
+
120
+ class SimulevalTranscoder:
121
+ def __init__(self, agent, sample_rate, debug, buffer_limit):
122
+ self.agent = agent.agent
123
+ self.has_expressive = agent.has_expressive
124
+ self.input_queue = asyncio.Queue()
125
+ self.output_queue = asyncio.Queue()
126
+ self.states = self.agent.build_states()
127
+ if debug:
128
+ self.get_states_root().debug = True
129
+ self.incoming_sample_rate = sample_rate
130
+ self.close = False
131
+ self.g2p = G2p()
132
+
133
+ # buffer all outgoing translations within this amount of time
134
+ self.output_buffer_idle_ms = 5000
135
+ self.output_buffer_size_limit = (
136
+ buffer_limit # phonemes for text, seconds for speech
137
+ )
138
+ self.output_buffer_cur_size = 0
139
+ self.output_buffer: List[List[Segment]] = []
140
+ self.speech_output_sample_rate = None
141
+
142
+ self.last_output_ts = time.time() * 1000
143
+ self.timeout_ms = (
144
+ 30000 # close the transcoder thread after this amount of silence
145
+ )
146
+ self.first_input_ts = None
147
+ self.first_output_ts = None
148
+ self.debug = debug
149
+ self.debug_ts = f"{time.time()}_{random.randint(1000, 9999)}"
150
+ if self.debug:
151
+ debug_folder = Path(__file__).resolve().parent.parent / "debug"
152
+ self.test_incoming_wav = soundfile.SoundFile(
153
+ debug_folder / f"{self.debug_ts}_test_incoming.wav",
154
+ mode="w+",
155
+ format="WAV",
156
+ subtype="PCM_16",
157
+ samplerate=self.incoming_sample_rate,
158
+ channels=1,
159
+ )
160
+ self.get_states_root().test_input_segments_wav = soundfile.SoundFile(
161
+ debug_folder / f"{self.debug_ts}_test_input_segments.wav",
162
+ mode="w+",
163
+ format="WAV",
164
+ samplerate=MODEL_SAMPLE_RATE,
165
+ channels=1,
166
+ )
167
+
168
+ def get_states_root(self) -> AgentStates:
169
+ if isinstance(self.agent, TreeAgentPipeline):
170
+ # self.states is a dict
171
+ return self.states[self.agent.source_module]
172
+ else:
173
+ # self.states is a list
174
+ return self.states[0]
175
+
176
+ def reset_states(self):
177
+ if isinstance(self.agent, TreeAgentPipeline):
178
+ states_iter = self.states.values()
179
+ else:
180
+ states_iter = self.states
181
+ for state in states_iter:
182
+ state.reset()
183
+
184
+ def debug_log(self, *args):
185
+ if self.debug:
186
+ logger.info(*args)
187
+
188
+ @classmethod
189
+ def build_agent(cls, model_path, config_name):
190
+ logger.info(f"Building simuleval agent: {model_path}, {config_name}")
191
+ agent = build_system_from_dir(
192
+ Path(__file__).resolve().parent.parent / f"models/{model_path}",
193
+ config_name=config_name,
194
+ )
195
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
196
+ agent.to(device, fp16=True)
197
+ logger.info(
198
+ f"Successfully built simuleval agent {model_path} on device {device}"
199
+ )
200
+
201
+ return agent
202
+
203
+ def process_incoming_bytes(self, incoming_bytes, dynamic_config):
204
+ # TODO: We probably want to do some validation on dynamic_config to ensure it has what we need
205
+ segment, sr = self._preprocess_wav(incoming_bytes)
206
+ segment = SpeechSegment(
207
+ content=segment,
208
+ sample_rate=sr,
209
+ tgt_lang=dynamic_config.get("targetLanguage"),
210
+ config=dynamic_config,
211
+ )
212
+ if dynamic_config.get("expressive") is True and self.has_expressive is False:
213
+ logger.warning(
214
+ "Passing 'expressive' but the agent does not support expressive output!"
215
+ )
216
+ # segment.content is a mono float32 array at MODEL_SAMPLE_RATE
217
+ self.input_queue.put_nowait(segment)
218
+
219
+ def get_input_segment(self):
220
+ if self.input_queue.empty():
221
+ return None
222
+ chunk = self.input_queue.get_nowait()
223
+ self.input_queue.task_done()
224
+ return chunk
225
+
226
+ def convert_waveform(
227
+ self,
228
+ waveform: Union[np.ndarray, torch.Tensor],
229
+ sample_rate: int,
230
+ normalize_volume: bool = False,
231
+ to_mono: bool = False,
232
+ to_sample_rate: Optional[int] = None,
233
+ ) -> Tuple[Union[np.ndarray, torch.Tensor], int]:
234
+ """convert a waveform:
235
+ - to a target sample rate
236
+ - from multi-channel to mono channel
237
+ - volume normalization
238
+
239
+ Args:
240
+ waveform (numpy.ndarray or torch.Tensor): 2D original waveform
241
+ (channels x length)
242
+ sample_rate (int): original sample rate
243
+ normalize_volume (bool): perform volume normalization
244
+ to_mono (bool): convert to mono channel if having multiple channels
245
+ to_sample_rate (Optional[int]): target sample rate
246
+ Returns:
247
+ waveform (numpy.ndarray or torch.Tensor): converted 2D waveform (channels x length)
248
+ sample_rate (int): target sample rate
249
+ """
250
+ try:
251
+ import torchaudio.sox_effects as ta_sox
252
+ except ImportError:
253
+ raise ImportError("Please install torchaudio: pip install torchaudio")
254
+
255
+ effects = []
256
+ if normalize_volume:
257
+ effects.append(["gain", "-n"])
258
+ if to_sample_rate is not None and to_sample_rate != sample_rate:
259
+ effects.append(["rate", f"{to_sample_rate}"])
260
+ if to_mono and waveform.shape[0] > 1:
261
+ effects.append(["channels", "1"])
262
+ if len(effects) > 0:
263
+ is_np_input = isinstance(waveform, np.ndarray)
264
+ _waveform = torch.from_numpy(waveform) if is_np_input else waveform
265
+ converted, converted_sample_rate = ta_sox.apply_effects_tensor(
266
+ _waveform, sample_rate, effects
267
+ )
268
+ if is_np_input:
269
+ converted = converted.numpy()
270
+ return converted, converted_sample_rate
271
+ return waveform, sample_rate
272
+
273
+ def _preprocess_wav(self, data: Any) -> Tuple[np.ndarray, int]:
274
+ segment, sample_rate = soundfile.read(
275
+ io.BytesIO(data),
276
+ dtype="float32",
277
+ always_2d=True,
278
+ frames=-1,
279
+ start=0,
280
+ format="RAW",
281
+ subtype="PCM_16",
282
+ samplerate=self.incoming_sample_rate,
283
+ channels=1,
284
+ )
285
+ if self.debug:
286
+ self.test_incoming_wav.seek(0, soundfile.SEEK_END)
287
+ self.test_incoming_wav.write(segment)
288
+
289
+ segment = segment.T
290
+ segment, new_sample_rate = self.convert_waveform(
291
+ segment,
292
+ sample_rate,
293
+ normalize_volume=False,
294
+ to_mono=True,
295
+ to_sample_rate=MODEL_SAMPLE_RATE,
296
+ )
297
+
298
+ assert MODEL_SAMPLE_RATE == new_sample_rate
299
+ segment = segment.squeeze(axis=0)
300
+ return segment, new_sample_rate
301
+
302
+ def process_pipeline_impl(self, input_segment):
303
+ try:
304
+ with torch.no_grad():
305
+ output_segment = OutputSegments(
306
+ self.agent.pushpop(input_segment, self.states)
307
+ )
308
+ if (
309
+ self.get_states_root().first_input_ts is not None
310
+ and self.first_input_ts is None
311
+ ):
312
+ # TODO: this is hacky
313
+ self.first_input_ts = self.get_states_root().first_input_ts
314
+
315
+ if not output_segment.is_empty:
316
+ self.output_queue.put_nowait(output_segment)
317
+
318
+ if output_segment.finished:
319
+ self.debug_log("OUTPUT SEGMENT IS FINISHED. Resetting states.")
320
+
321
+ self.reset_states()
322
+
323
+ if self.debug:
324
+ # when we rebuild states, this value is reset to whatever
325
+ # is in the system dir config, which defaults debug=False.
326
+ self.get_states_root().debug = True
327
+ except Exception as e:
328
+ logger.error(f"Got exception while processing pipeline: {e}")
329
+ traceback.print_exc()
330
+ return input_segment
331
+
332
+ def process_pipeline_loop(self):
333
+ if self.close:
334
+ return # closes the thread
335
+
336
+ self.debug_log("processing_pipeline")
337
+ while not self.close:
338
+ input_segment = self.get_input_segment()
339
+ if input_segment is None:
340
+ if self.get_states_root().is_fresh_state: # TODO: this is hacky
341
+ time.sleep(0.3)
342
+ else:
343
+ time.sleep(0.03)
344
+ continue
345
+ self.process_pipeline_impl(input_segment)
346
+ self.debug_log("finished processing_pipeline")
347
+
348
+ def process_pipeline_once(self):
349
+ if self.close:
350
+ return
351
+
352
+ self.debug_log("processing pipeline once")
353
+ input_segment = self.get_input_segment()
354
+ if input_segment is None:
355
+ return
356
+ self.process_pipeline_impl(input_segment)
357
+ self.debug_log("finished processing_pipeline_once")
358
+
359
+ def get_output_segment(self):
360
+ if self.output_queue.empty():
361
+ return None
362
+
363
+ output_chunk = self.output_queue.get_nowait()
364
+ self.output_queue.task_done()
365
+ return output_chunk
366
+
367
+ def start(self):
368
+ self.debug_log("starting transcoder in a thread")
369
+ threading.Thread(target=self.process_pipeline_loop).start()
370
+
371
+ def first_translation_time(self):
372
+ return round((self.first_output_ts - self.first_input_ts) / 1000, 2)
373
+
374
+ def get_buffered_output(self) -> SpeechAndTextOutput:
375
+ now = time.time() * 1000
376
+ self.debug_log(f"get_buffered_output queue size: {self.output_queue.qsize()}")
377
+ while not self.output_queue.empty():
378
+ tmp_out = self.get_output_segment()
379
+ if tmp_out and tmp_out.compute_length(self.g2p) > 0:
380
+ if len(self.output_buffer) == 0:
381
+ self.last_output_ts = now
382
+ self._populate_output_buffer(tmp_out)
383
+ self._increment_output_buffer_size(tmp_out)
384
+
385
+ if tmp_out.finished:
386
+ self.debug_log("tmp_out.finished")
387
+ res = self._gather_output_buffer_data(final=True)
388
+ self.debug_log(f"gathered output data: {res}")
389
+ self.output_buffer = []
390
+ self.output_buffer_cur_size = 0
391
+ self.last_output_ts = now
392
+ self.first_output_ts = now
393
+ return res
394
+ else:
395
+ self.debug_log("tmp_out.compute_length is not > 0")
396
+
397
+ if len(self.output_buffer) > 0 and (
398
+ now - self.last_output_ts >= self.output_buffer_idle_ms
399
+ or self.output_buffer_cur_size >= self.output_buffer_size_limit
400
+ ):
401
+ self.debug_log(
402
+ "[get_buffered_output] output_buffer is not empty. getting res to return."
403
+ )
404
+ self.last_output_ts = now
405
+ res = self._gather_output_buffer_data(final=False)
406
+ self.debug_log(f"gathered output data: {res}")
407
+ self.output_buffer = []
408
+ self.output_buffer_cur_size = 0
409
+ self.first_output_ts = now
410
+ return res
411
+ else:
412
+ self.debug_log("[get_buffered_output] output_buffer is empty...")
413
+ return None
414
+
415
+ def _gather_output_buffer_data(self, final):
416
+ output = SpeechAndTextOutput()
417
+ output.final = final
418
+ output = OutputSegments.join_output_buffer(self.output_buffer, output)
419
+ return output
420
+
421
+ def _increment_output_buffer_size(self, segment: OutputSegments):
422
+ self.output_buffer_cur_size += segment.compute_length(self.g2p)
423
+
424
+ def _populate_output_buffer(self, segment: OutputSegments):
425
+ self.output_buffer.append(segment.segments)
426
+
427
+ def _compute_phoneme_count(self, string: str) -> int:
428
+ return len([x for x in self.g2p(string) if x != " "])
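
To make the transcoder lifecycle concrete, a rough driver sketch follows. The method names are all taken from the class above; the agent object, incoming sample rate, raw PCM chunk, and polling cadence are illustrative assumptions.

    import time

    # `agent` is assumed to come from the agent directory (see simuleval_agent_directory.py);
    # `chunk_bytes` stands in for raw PCM_16 audio bytes received from a client.
    transcoder = SimulevalTranscoder(agent, sample_rate=48_000, debug=False, buffer_limit=1)
    transcoder.start()  # spawns the process_pipeline_loop thread

    transcoder.process_incoming_bytes(chunk_bytes, {"targetLanguage": "spa", "expressive": False})

    # poll until the buffer flushes (idle timeout or size limit reached)
    out = None
    while out is None:
        time.sleep(0.1)
        out = transcoder.get_buffered_output()
    print(out.text, out.speech_sample_rate, out.final)
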
seamless_server/src/speech_and_text_output.py ADDED
@@ -0,0 +1,15 @@
1
+ # Provides a container to return both speech and text output from our model at the same time
2
+
3
+
4
+ class SpeechAndTextOutput:
5
+ def __init__(
6
+ self,
7
+ text: str = None,
8
+ speech_samples: list = None,
9
+ speech_sample_rate: float = None,
10
+ final: bool = False,
11
+ ):
12
+ self.text = text
13
+ self.speech_samples = speech_samples
14
+ self.speech_sample_rate = speech_sample_rate
15
+ self.final = final
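
For illustration, the container can be constructed directly; the values below are made up.

    out = SpeechAndTextOutput(
        text="hello world",
        speech_samples=[0.0] * 1600,  # 0.1 s of silence at 16 kHz
        speech_sample_rate=16000,
        final=False,
    )
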
seamless_server/src/transcoder_helpers.py ADDED
@@ -0,0 +1,43 @@
1
+ import logging
2
+
3
+ logger = logging.getLogger("socketio_server_pubsub")
4
+
5
+
6
+ def get_transcoder_output_events(transcoder) -> list:
7
+ speech_and_text_output = transcoder.get_buffered_output()
8
+ if speech_and_text_output is None:
9
+ logger.debug("No output from transcoder.get_buffered_output()")
10
+ return []
11
+
12
+ logger.debug(f"We DID get output from the transcoder! {speech_and_text_output}")
13
+
14
+ lat = None
15
+
16
+ events = []
17
+
18
+ if speech_and_text_output.speech_samples:
19
+ events.append(
20
+ {
21
+ "event": "translation_speech",
22
+ "payload": speech_and_text_output.speech_samples,
23
+ "sample_rate": speech_and_text_output.speech_sample_rate,
24
+ }
25
+ )
26
+
27
+ if speech_and_text_output.text:
28
+ events.append(
29
+ {
30
+ "event": "translation_text",
31
+ "payload": speech_and_text_output.text,
32
+ }
33
+ )
34
+
35
+ for e in events:
36
+ e["eos"] = speech_and_text_output.final
37
+
38
+ # if not latency_sent:
39
+ # lat = transcoder.first_translation_time()
40
+ # latency_sent = True
41
+ # to_send["latency"] = lat
42
+
43
+ return events
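
A hedged sketch of how a pubsub server loop might forward these events; the `sio` AsyncServer instance and `room_id` are assumptions, while the dict keys ("event", "payload", "sample_rate", "eos") are exactly those built above.

    async def forward_transcoder_output(sio, transcoder, room_id: str) -> None:
        # sio is assumed to be a python-socketio AsyncServer
        for event in get_transcoder_output_events(transcoder):
            await sio.emit(event["event"], event, room=room_id)
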
streaming-react-app/.eslintrc.cjs ADDED
@@ -0,0 +1,18 @@
1
+ module.exports = {
2
+ root: true,
3
+ env: {browser: true, es2020: true},
4
+ extends: [
5
+ 'eslint:recommended',
6
+ 'plugin:@typescript-eslint/recommended',
7
+ 'plugin:react-hooks/recommended',
8
+ ],
9
+ ignorePatterns: ['dist', '.eslintrc.cjs'],
10
+ parser: '@typescript-eslint/parser',
11
+ plugins: ['react-refresh'],
12
+ rules: {
13
+ 'react-refresh/only-export-components': [
14
+ 'warn',
15
+ {allowConstantExport: true},
16
+ ],
17
+ },
18
+ };
streaming-react-app/.gitignore ADDED
@@ -0,0 +1,24 @@
1
+ # Logs
2
+ logs
3
+ *.log
4
+ npm-debug.log*
5
+ yarn-debug.log*
6
+ yarn-error.log*
7
+ pnpm-debug.log*
8
+ lerna-debug.log*
9
+
10
+ node_modules
11
+ dist
12
+ dist-ssr
13
+ *.local
14
+
15
+ # Editor directories and files
16
+ .vscode/*
17
+ !.vscode/extensions.json
18
+ .idea
19
+ .DS_Store
20
+ *.suo
21
+ *.ntvs*
22
+ *.njsproj
23
+ *.sln
24
+ *.sw?
streaming-react-app/README.md ADDED
@@ -0,0 +1,14 @@
1
+ # 🚀 Streaming React App
2
+
3
+ ## Getting Started
4
+
5
+ This project uses the [Yarn Package Manager](https://yarnpkg.com/).
6
+
7
+ 1. `yarn` - Install project dependencies
8
+ 2. `yarn run dev` - Run the app with a development server that supports hot module reloading
9
+
10
+ NOTE: You will need to provide the server URL either via an environment variable (you can use the `.env` file for this) or via a URL param when you load the React app (example: `http://localhost:5173/?serverURL=localhost:8000`)
11
+
12
+ ## URL Parameters
13
+
14
+ You can provide URL parameters in order to change the behavior of the app. Those are documented in [URLParams.ts](src/URLParams.ts).
streaming-react-app/index.html ADDED
@@ -0,0 +1,13 @@
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8" />
5
+ <link rel="icon" type="image/svg+xml" href="/src/assets/illy.svg"/>
6
+ <meta name="viewport" content="width=device-width, initial-scale=1.0" />
7
+ <title>Illy's translator</title>
8
+ </head>
9
+ <body>
10
+ <div id="root"></div>
11
+ <script type="module" src="/src/main.tsx"></script>
12
+ </body>
13
+ </html>
streaming-react-app/package-lock.json ADDED
The diff for this file is too large to render. See raw diff
 
streaming-react-app/package.json ADDED
@@ -0,0 +1,53 @@
1
+ {
2
+ "name": "streaming-react-app",
3
+ "private": true,
4
+ "version": "0.0.14",
5
+ "type": "module",
6
+ "scripts": {
7
+ "dev": "vite --host --strictPort",
8
+ "build": "vite build",
9
+ "preview": "vite preview",
10
+ "clean:node-modules": "rm -rf node_modules/",
11
+ "ts-check": "tsc --noEmit",
12
+ "lint": "eslint . --ext ts,tsx --report-unused-disable-directives --max-warnings 0",
13
+ "prettier-check": "cd ../ && yarn run prettier-base --check streaming-react-app",
14
+ "signal": "concurrently --names \"TS,LINT,PRETTIER\" -c \"bgBlack.bold,bgRed.bold,bgCyan.bold\" \"yarn run ts-check\" \"yarn run lint\" \"yarn run prettier-check\""
15
+ },
16
+ "dependencies": {
17
+ "@emotion/react": "11.11.1",
18
+ "@emotion/styled": "11.11.0",
19
+ "@mui/icons-material": "5.14.3",
20
+ "@mui/material": "5.14.5",
21
+ "@react-three/drei": "^9.83.9",
22
+ "@react-three/fiber": "^8.14.1",
23
+ "@react-three/xr": "^5.7.1",
24
+ "amazon-cognito-identity-js": "^6.3.6",
25
+ "audiobuffer-to-wav": "^1.0.0",
26
+ "aws-sdk": "^2.1472.0",
27
+ "js-cookie": "^3.0.5",
28
+ "lodash": "4.17.21",
29
+ "react": "^18.2.0",
30
+ "react-dom": "^18.2.0",
31
+ "react-google-charts": "^4.0.1",
32
+ "socket.io-client": "^4.7.2",
33
+ "three": "^0.156.1",
34
+ "three-mesh-ui": "^6.5.4",
35
+ "uuid": "^9.0.0",
36
+ "zustand": "^4.4.3"
37
+ },
38
+ "devDependencies": {
39
+ "@types/node": "^20.5.3",
40
+ "@types/react": "^18.2.15",
41
+ "@types/react-dom": "^18.2.7",
42
+ "@types/uuid": "^9.0.2",
43
+ "@typescript-eslint/eslint-plugin": "^6.0.0",
44
+ "@typescript-eslint/parser": "^6.0.0",
45
+ "@vitejs/plugin-react": "^4.0.3",
46
+ "concurrently": "8.2.1",
47
+ "eslint": "^8.45.0",
48
+ "eslint-plugin-react-hooks": "^4.6.0",
49
+ "eslint-plugin-react-refresh": "^0.4.3",
50
+ "typescript": "5.1.6",
51
+ "vite": "^4.4.5"
52
+ }
53
+ }
streaming-react-app/src/App.tsx ADDED
@@ -0,0 +1,57 @@
1
+ import SocketWrapper from './SocketWrapper';
2
+ import {ThemeProvider} from '@mui/material/styles';
3
+ import theme from './theme';
4
+ import StreamingInterface from './StreamingInterface';
5
+ import CssBaseline from '@mui/material/CssBaseline';
6
+ import {createContext, useCallback, useState} from 'react';
7
+ import packageJson from '../package.json';
8
+
9
+ console.log(`Streaming React App version: ${packageJson?.version}`);
10
+
11
+ // Roboto font for mui ui library
12
+ // import '@fontsource/roboto/300.css';
13
+ // import '@fontsource/roboto/400.css';
14
+ // import '@fontsource/roboto/500.css';
15
+ // import '@fontsource/roboto/700.css';
16
+
17
+ export const AppResetKeyContext = createContext<(newKey: string) => void>(
18
+ () => {
19
+ throw new Error('AppResetKeyContext not initialized');
20
+ },
21
+ );
22
+
23
+ function App() {
24
+ return (
25
+ <ThemeProvider theme={theme}>
26
+ <CssBaseline />
27
+ <SocketWrapper>
28
+ <StreamingInterface />
29
+ </SocketWrapper>
30
+ </ThemeProvider>
31
+ );
32
+ }
33
+
34
+ function AppWrapper() {
35
+ const [appResetKey, setAppResetKey] = useState<string>('[initial value]');
36
+ const setAppResetKeyHandler = useCallback((newKey: string) => {
37
+ setAppResetKey((prev) => {
38
+ console.warn(
39
+ `Resetting the app with appResetKey: ${newKey}; prevKey: ${prev}`,
40
+ );
41
+ if (prev === newKey) {
42
+ console.error(
43
+ `The appResetKey was the same as the previous key, so the app will not reset.`,
44
+ );
45
+ }
46
+ return newKey;
47
+ });
48
+ }, []);
49
+
50
+ return (
51
+ <AppResetKeyContext.Provider value={setAppResetKeyHandler}>
52
+ <App key={appResetKey} />
53
+ </AppResetKeyContext.Provider>
54
+ );
55
+ }
56
+
57
+ export default AppWrapper;
streaming-react-app/src/Blink.tsx ADDED
@@ -0,0 +1,41 @@
1
+ import Box from '@mui/material/Box';
2
+ import {useEffect, useState} from 'react';
3
+
4
+ type Props = {
5
+ intervalMs: number;
6
+ children: React.ReactNode;
7
+ shouldBlink: boolean;
8
+ // display?: 'block' | 'inline' | 'inline-block';
9
+ };
10
+
11
+ export default function Blink({
12
+ // display = 'inline-block',
13
+ shouldBlink,
14
+ intervalMs,
15
+ children,
16
+ }: Props): React.ReactElement {
17
+ const [cursorBlinkOn, setCursorBlinkOn] = useState(false);
18
+
19
+ useEffect(() => {
20
+ if (shouldBlink) {
21
+ const interval = setInterval(() => {
22
+ setCursorBlinkOn((prev) => !prev);
23
+ }, intervalMs);
24
+
25
+ return () => clearInterval(interval);
26
+ } else {
27
+ setCursorBlinkOn(false);
28
+ }
29
+ }, [intervalMs, shouldBlink]);
30
+
31
+ return (
32
+ <Box
33
+ component="span"
34
+ sx={{
35
+ display: 'inline-block',
36
+ visibility: cursorBlinkOn ? 'visible' : 'hidden',
37
+ }}>
38
+ {children}
39
+ </Box>
40
+ );
41
+ }
streaming-react-app/src/DebugSection.tsx ADDED
@@ -0,0 +1,62 @@
1
+ import {Chart} from 'react-google-charts';
2
+ import debug from './debug';
3
+ import {
4
+ Accordion,
5
+ AccordionDetails,
6
+ AccordionSummary,
7
+ Button,
8
+ Typography,
9
+ } from '@mui/material';
10
+ import {useState} from 'react';
11
+ import ArrowDropDownIcon from '@mui/icons-material/ArrowDropDown';
12
+
13
+ export default function DebugChart() {
14
+ const [showDebugTimings, setShowDebugTimings] = useState<boolean>(false);
15
+
16
+ const data = debug()?.getChartData();
17
+ const options = {
18
+ timeline: {
19
+ groupByRowLabel: true,
20
+ },
21
+ };
22
+
23
+ return (
24
+ <div className="horizontal-padding-sra text-chunk-sra">
25
+ <Accordion
26
+ expanded={showDebugTimings}
27
+ onChange={() => setShowDebugTimings(!showDebugTimings)}
28
+ elevation={0}
29
+ sx={{border: 1, borderColor: 'rgba(0, 0, 0, 0.3)'}}>
30
+ <AccordionSummary
31
+ expandIcon={<ArrowDropDownIcon />}
32
+ className="debug-section">
33
+ Debug Info
34
+ </AccordionSummary>
35
+ <AccordionDetails>
36
+ {data && data.length > 1 ? (
37
+ <>
38
+ <Chart
39
+ chartType="Timeline"
40
+ data={data}
41
+ width="100%"
42
+ height="400px"
43
+ options={options}
44
+ />
45
+ <Button
46
+ variant="contained"
47
+ sx={{marginBottom: 1}}
48
+ onClick={() => {
49
+ debug()?.downloadInputAudio();
50
+ debug()?.downloadOutputAudio();
51
+ }}>
52
+ Download Input / Output Audio
53
+ </Button>
54
+ </>
55
+ ) : (
56
+ <Typography>No input / output detected</Typography>
57
+ )}
58
+ </AccordionDetails>
59
+ </Accordion>
60
+ </div>
61
+ );
62
+ }
streaming-react-app/src/RoomConfig.tsx ADDED
@@ -0,0 +1,263 @@
1
+ import Stack from '@mui/material/Stack';
2
+ import TextField from '@mui/material/TextField';
3
+ import {isValidRoomID, isValidPartialRoomID} from './generateNewRoomID';
4
+ import {useCallback, useEffect, useState} from 'react';
5
+ import Button from '@mui/material/Button';
6
+ import {useSocket} from './useSocket';
7
+ import FormGroup from '@mui/material/FormGroup';
8
+ import FormControlLabel from '@mui/material/FormControlLabel';
9
+ import Checkbox from '@mui/material/Checkbox';
10
+ import {RoomState} from './types/RoomState';
11
+ import setURLParam from './setURLParam';
12
+ import {getURLParams} from './URLParams';
13
+ import {
14
+ JoinRoomConfig,
15
+ Roles,
16
+ ServerState,
17
+ StreamingStatus,
18
+ } from './types/StreamingTypes';
19
+ import Alert from '@mui/material/Alert';
20
+
21
+ function capitalize(str: string): string {
22
+ return str.charAt(0).toUpperCase() + str.slice(1);
23
+ }
24
+
25
+ type Props = {
26
+ roomState: RoomState | null;
27
+ serverState: ServerState | null;
28
+ onJoinRoomOrUpdateRoles?: () => void;
29
+ streamingStatus: StreamingStatus;
30
+ };
31
+
32
+ export default function RoomConfig({
33
+ roomState,
34
+ serverState,
35
+ onJoinRoomOrUpdateRoles,
36
+ streamingStatus,
37
+ }: Props) {
38
+ const {socket, clientID} = useSocket();
39
+
40
+ const urlParams = getURLParams();
41
+ const roomIDParam = urlParams.roomID;
42
+ const autoJoinRoom = urlParams.autoJoin;
43
+
44
+ const [roomID, setRoomID] = useState<string>(
45
+ (roomIDParam ?? '').toUpperCase(),
46
+ );
47
+ const [roomIDError, setRoomIDError] = useState<boolean>(false);
48
+ const [roles, setRoles] = useState<{speaker: boolean; listener: boolean}>({
49
+ speaker: true,
50
+ listener: true,
51
+ });
52
+ const [lockServer, setLockServer] = useState<boolean>(false);
53
+ const [lockServerName, setLockServerName] = useState<string>('');
54
+
55
+ const [joinInProgress, setJoinInProgress] = useState<boolean>(false);
56
+ const [didAttemptAutoJoin, setDidAttemptAutoJoin] = useState<boolean>(false);
57
+
58
+ const isValidServerLock =
59
+ lockServer === false ||
60
+ (lockServerName != null && lockServerName.length > 0);
61
+ const isValidRoles = Object.values(roles).filter(Boolean).length > 0;
62
+ const isValidAllInputs =
63
+ isValidRoomID(roomID) && isValidRoles && isValidServerLock;
64
+ const roomIDFromServer = roomState?.room_id ?? null;
65
+
66
+ const onJoinRoom = useCallback(
67
+ (createNewRoom: boolean) => {
68
+ if (socket == null) {
69
+ console.error('Socket is null, cannot join room');
70
+ return;
71
+ }
72
+ console.debug(`Attempting to join roomID ${roomID}...`);
73
+
74
+ const lockServerValidated: string | null =
75
+ lockServer && roles['speaker'] ? lockServerName : null;
76
+
77
+ setJoinInProgress(true);
78
+
79
+ const configObject: JoinRoomConfig = {
80
+ roles: (Object.keys(roles) as Array<Roles>).filter(
81
+ (role) => roles[role] === true,
82
+ ),
83
+ lockServerName: lockServerValidated,
84
+ };
85
+
86
+ socket.emit(
87
+ 'join_room',
88
+ clientID,
89
+ //createNewRoom ? null : roomID,
90
+ true,
91
+ configObject,
92
+ (result) => {
93
+ console.log('join_room result:', result);
94
+ if (createNewRoom) {
95
+ setRoomID(result.roomID);
96
+ }
97
+ if (onJoinRoomOrUpdateRoles != null) {
98
+ onJoinRoomOrUpdateRoles();
99
+ }
100
+ setURLParam('roomID', result.roomID);
101
+ setJoinInProgress(false);
102
+ },
103
+ );
104
+ },
105
+ [
106
+ clientID,
107
+ lockServer,
108
+ lockServerName,
109
+ onJoinRoomOrUpdateRoles,
110
+ roles,
111
+ roomID,
112
+ socket,
113
+ ],
114
+ );
115
+
116
+ useEffect(() => {
117
+ if (
118
+ autoJoinRoom === true &&
119
+ didAttemptAutoJoin === false &&
120
+ socket != null
121
+ ) {
122
+ // We want to consider this an attempt whether or not we actually try to join, because
123
+ // we only want auto-join to happen on initial load
124
+ setDidAttemptAutoJoin(true);
125
+ if (
126
+ isValidAllInputs &&
127
+ joinInProgress === false &&
128
+ roomIDFromServer == null
129
+ ) {
130
+ console.debug('Attempting to auto-join room...');
131
+
132
+ onJoinRoom(false);
133
+ } else {
134
+ console.debug('Unable to auto-join room', {
135
+ isValidAllInputs,
136
+ joinInProgress,
137
+ roomIDFromServer,
138
+ });
139
+ }
140
+ }
141
+ }, [
142
+ autoJoinRoom,
143
+ didAttemptAutoJoin,
144
+ isValidAllInputs,
145
+ joinInProgress,
146
+ onJoinRoom,
147
+ roomIDFromServer,
148
+ socket,
149
+ ]);
150
+
151
+ return (
152
+ <Stack direction="column" spacing="12px">
153
+ <Stack direction="row" spacing="12px" sx={{alignItems: 'center'}}>
154
+ {/* <TextField
155
+ size="small"
156
+ label="Room Code"
157
+ variant="outlined"
158
+ disabled={roomState?.room_id != null}
159
+ value={roomID}
160
+ error={roomIDError}
161
+ onChange={(e) => {
162
+ const id = e.target.value.toUpperCase();
163
+ if (isValidPartialRoomID(id)) {
164
+ setRoomIDError(false);
165
+ setRoomID(id);
166
+ } else {
167
+ setRoomIDError(true);
168
+ }
169
+ }}
170
+ sx={{width: '8em'}}
171
+ /> */}
172
+
173
+ {/* <div>
174
+ <Button
175
+ variant="contained"
176
+ disabled={
177
+ isValidAllInputs === false ||
178
+ joinInProgress ||
179
+ streamingStatus !== 'stopped'
180
+ }
181
+ onClick={() => onJoinRoom(false)}>
182
+ {roomState?.room_id != null ? 'Update Roles' : 'Join Room'}
183
+ </Button>
184
+ </div> */}
185
+
186
+ {roomState?.room_id == null && (
187
+ <div>
188
+ <Button
189
+ variant="contained"
190
+ disabled={
191
+ roomState?.room_id != null ||
192
+ joinInProgress ||
193
+ streamingStatus !== 'stopped'
194
+ }
195
+ onClick={() => onJoinRoom(true)}>
196
+ {'Start translating'}
197
+ </Button>
198
+ </div>
199
+ )}
200
+ </Stack>
201
+
202
+ <FormGroup>
203
+ {Object.keys(roles).map((role) => {
204
+ return (
205
+ <FormControlLabel
206
+ disabled={streamingStatus !== 'stopped'}
207
+ key={role}
208
+ control={
209
+ <Checkbox
210
+ checked={roles[role]}
211
+ onChange={(event: React.ChangeEvent<HTMLInputElement>) => {
212
+ setRoles((prevRoles) => ({
213
+ ...prevRoles,
214
+ [role]: event.target.checked,
215
+ }));
216
+ }}
217
+ />
218
+ }
219
+ label={capitalize(role)}
220
+ />
221
+ );
222
+ })}
223
+
224
+ {urlParams.enableServerLock && roles['speaker'] === true && (
225
+ <>
226
+ <FormControlLabel
227
+ disabled={streamingStatus !== 'stopped'}
228
+ control={
229
+ <Checkbox
230
+ checked={lockServer}
231
+ onChange={(event: React.ChangeEvent<HTMLInputElement>) => {
232
+ setLockServer(event.target.checked);
233
+ }}
234
+ />
235
+ }
236
+ label="Lock Server (prevent other users from streaming)"
237
+ />
238
+ </>
239
+ )}
240
+ </FormGroup>
241
+
242
+ {urlParams.enableServerLock &&
243
+ roles['speaker'] === true &&
244
+ lockServer && (
245
+ <TextField
246
+ disabled={streamingStatus !== 'stopped'}
247
+ label="Enter Your Name + Expected Lock End Time"
248
+ variant="outlined"
249
+ value={lockServerName}
250
+ onChange={(event: React.ChangeEvent<HTMLInputElement>) => {
251
+ setLockServerName(event.target.value);
252
+ }}
253
+ helperText="Locking the server will prevent anyone else from using it until you close the page, in order to maximize server performance. Please only use this for live demos."
254
+ />
255
+ )}
256
+
257
+ {serverState?.serverLock != null &&
258
+ serverState.serverLock.clientID === clientID && (
259
+ <Alert severity="success">{`The server is now locked for your use (${serverState?.serverLock?.name}). Close this window to release the lock so that others may use the server.`}</Alert>
260
+ )}
261
+ </Stack>
262
+ );
263
+ }
streaming-react-app/src/SocketWrapper.tsx ADDED
@@ -0,0 +1,218 @@
1
+ import {useContext, useEffect, useMemo, useRef, useState} from 'react';
2
+ import socketIOClient, {Socket} from 'socket.io-client';
3
+ import useStable from './useStable';
4
+ import {v4 as uuidv4} from 'uuid';
5
+ import {SocketContext} from './useSocket';
6
+ import {AppResetKeyContext} from './App';
7
+ import Backdrop from '@mui/material/Backdrop';
8
+ import CircularProgress from '@mui/material/CircularProgress';
9
+ import Typography from '@mui/material/Typography';
10
+ import {getURLParams} from './URLParams';
11
+
12
+ // The time to wait before showing a "disconnected" screen upon initial app load
13
+ const INITIAL_DISCONNECT_SCREEN_DELAY = 2000;
14
+ const SERVER_URL_DEFAULT = `${window.location.protocol === "https:" ? "wss" : "ws"
15
+ }://${window.location.host}`;
16
+
17
+ export default function SocketWrapper({children}) {
18
+ const [socket, setSocket] = useState<Socket | null>(null);
19
+ const [connected, setConnected] = useState<boolean | null>(null);
20
+ // Default to true:
21
+ const [willAttemptReconnect] = useState<boolean>(true);
22
+ const serverIDRef = useRef<string | null>(null);
23
+
24
+ const setAppResetKey = useContext(AppResetKeyContext);
25
+
26
+ /**
27
+ * Previously we had stored the clientID in local storage, but in that case
28
+ * if a user refreshes their page they'll still have the same clientID, and
29
+ * will be put back into the same room, which may be confusing if they're trying
30
+ * to join a new room or reset the app interface. So now clientIDs persist only as
31
+ * long as the react app full lifecycle
32
+ */
33
+ const clientID = useStable<string>(() => {
34
+ const newID = uuidv4();
35
+ // NOTE: the clientID is intentionally not persisted to storage, so a full
36
+ // page reload generates a fresh ID (see the comment above)
37
+ return newID;
38
+ });
39
+
40
+ const socketObject = useMemo(
41
+ () => ({socket, clientID, connected: connected ?? false}),
42
+ [socket, clientID, connected],
43
+ );
44
+
45
+ useEffect(() => {
46
+ const queryParams = {
47
+ clientID: clientID,
48
+ };
49
+
50
+ const serverURLFromParams = getURLParams().serverURL;
51
+ const serverURL = serverURLFromParams ?? SERVER_URL_DEFAULT;
52
+
53
+ console.log(
54
+ `Opening socket connection to ${
55
+ serverURL?.length === 0 ? 'window.location.host' : serverURL
56
+ } with query params:`,
57
+ queryParams,
58
+ );
59
+
60
+ const newSocket: Socket = socketIOClient(serverURL, {
61
+ query: queryParams,
62
+ // Normally socket.io will fallback to http polling, but we basically never
63
+ // want that because that'd mean awful performance. It'd be better for the app
64
+ // to simply break in that case and not connect.
65
+ transports: ['websocket'],
66
+ path: '/ws/socket.io'
67
+ });
68
+
69
+ const onServerID = (serverID: string) => {
70
+ console.debug('Received server ID:', serverID);
71
+ if (serverIDRef.current != null) {
72
+ if (serverIDRef.current !== serverID) {
73
+ console.error(
74
+ 'Server ID changed. Resetting the app using the app key',
75
+ );
76
+ setAppResetKey(serverID);
77
+ }
78
+ }
79
+ serverIDRef.current = serverID;
80
+ };
81
+
82
+ newSocket.on('server_id', onServerID);
83
+
84
+ setSocket(newSocket);
85
+
86
+ return () => {
87
+ newSocket.off('server_id', onServerID);
88
+ console.log(
89
+ 'Closing socket connection in the useEffect cleanup function...',
90
+ );
91
+ newSocket.disconnect();
92
+ setSocket(null);
93
+ };
94
+ }, [clientID, setAppResetKey]);
95
+
96
+ useEffect(() => {
97
+ if (socket != null) {
98
+ const onAny = (eventName: string, ...args) => {
99
+ console.debug(`[event: ${eventName}] args:`, ...args);
100
+ };
101
+
102
+ socket.onAny(onAny);
103
+
104
+ return () => {
105
+ socket.offAny(onAny);
106
+ };
107
+ }
108
+ return () => {};
109
+ }, [socket]);
110
+
111
+ useEffect(() => {
112
+ if (socket != null) {
113
+ const onConnect = (...args) => {
114
+ console.debug('Connected to server with args:', ...args);
115
+ setConnected(true);
116
+ };
117
+
118
+ const onConnectError = (err) => {
119
+ console.error(`Connection error due to ${err.message}`);
120
+ };
121
+
122
+ const onDisconnect = (reason) => {
123
+ setConnected(false);
124
+ console.log(`Disconnected due to ${reason}`);
125
+ };
126
+
127
+ socket.on('connect', onConnect);
128
+ socket.on('connect_error', onConnectError);
129
+ socket.on('disconnect', onDisconnect);
130
+
131
+ return () => {
132
+ socket.off('connect', onConnect);
133
+ socket.off('connect_error', onConnectError);
134
+ socket.off('disconnect', onDisconnect);
135
+ };
136
+ }
137
+ }, [socket]);
138
+
139
+ useEffect(() => {
140
+ if (socket != null) {
141
+ const onReconnectError = (err) => {
142
+ console.log(`Reconnect error due to ${err.message}`);
143
+ };
144
+
145
+ socket.io.on('reconnect_error', onReconnectError);
146
+
147
+ const onError = (err) => {
148
+ console.log(`General socket error with message ${err.message}`);
149
+ };
150
+ socket.io.on('error', onError);
151
+
152
+ const onReconnect = (attempt) => {
153
+ console.log(`Reconnected after ${attempt} attempt(s)`);
154
+ };
155
+ socket.io.on('reconnect', onReconnect);
156
+
157
+ const disconnectOnBeforeUnload = () => {
158
+ console.log('Disconnecting due to beforeunload event...');
159
+ socket.disconnect();
160
+ setSocket(null);
161
+ };
162
+ window.addEventListener('beforeunload', disconnectOnBeforeUnload);
163
+
164
+ return () => {
165
+ socket.io.off('reconnect_error', onReconnectError);
166
+ socket.io.off('error', onError);
167
+ socket.io.off('reconnect', onReconnect);
168
+ window.removeEventListener('beforeunload', disconnectOnBeforeUnload);
169
+ };
170
+ }
171
+ }, [clientID, setAppResetKey, socket]);
172
+
173
+ /**
174
+ * Wait to show the disconnected screen on initial app load
175
+ */
176
+ useEffect(() => {
177
+ window.setTimeout(() => {
178
+ setConnected((prev) => {
179
+ if (prev === null) {
180
+ return false;
181
+ }
182
+ return prev;
183
+ });
184
+ }, INITIAL_DISCONNECT_SCREEN_DELAY);
185
+ }, []);
186
+
187
+ return (
188
+ <SocketContext.Provider value={socketObject}>
189
+ {children}
190
+
191
+ <Backdrop
192
+ open={connected === false && willAttemptReconnect === true}
193
+ sx={{
194
+ color: '#fff',
195
+ zIndex: (theme) => theme.zIndex.drawer + 1,
196
+ }}>
197
+ <div
198
+ style={{
199
+ alignItems: 'center',
200
+ flexDirection: 'column',
201
+ textAlign: 'center',
202
+ }}>
203
+ <CircularProgress color="inherit" />
204
+ <Typography
205
+ align="center"
206
+ fontSize={{sm: 18, xs: 16}}
207
+ sx={{
208
+ fontFamily:
209
+ 'ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace',
210
+ fontWeight: 'bold',
211
+ }}>
212
+ {'Disconnected. Attempting to reconnect...'}
213
+ </Typography>
214
+ </div>
215
+ </Backdrop>
216
+ </SocketContext.Provider>
217
+ );
218
+ }
streaming-react-app/src/StreamingInterface.css ADDED
@@ -0,0 +1,56 @@
1
+ .app-wrapper-sra {
2
+ display: flex;
3
+ flex-direction: column;
4
+ justify-content: center;
5
+ align-items: center;
6
+ }
7
+
8
+ .main-container-sra {
9
+ background-color: white;
10
+ display: flex;
11
+ flex-direction: column;
12
+ justify-content: flex-start;
13
+ text-align: left;
14
+ margin: 16px;
15
+ margin-bottom: 36px;
16
+ border-radius: 8px;
17
+ box-shadow: 0px 24px 30px rgba(0, 0, 0, 0.3);
18
+ border: 1px solid rgba(0, 0, 0, 0.05);
19
+ overflow: hidden;
20
+ }
21
+
22
+ .top-section-sra {
23
+ padding-top: 24px;
24
+ margin-bottom: 24px;
25
+ display: flex;
26
+ flex-direction: column;
27
+ justify-content: flex-start;
28
+ }
29
+
30
+ .horizontal-padding-sra {
31
+ padding-left: 20px;
32
+ padding-right: 20px;
33
+ }
34
+
35
+ .header-container-sra {
36
+ display: flex;
37
+ flex-direction: row;
38
+ justify-content: flex-start;
39
+ align-items: center;
40
+ margin-bottom: 24px;
41
+ }
42
+
43
+ .header-icon-sra {
44
+ display: block;
45
+ margin-right: 12px;
46
+ }
47
+
48
+ .translation-text-container-sra {
49
+ background-color: #f8f8f8;
50
+ padding-top: 12px;
51
+ padding-bottom: 4px;
52
+ }
53
+
54
+ .text-chunk-sra {
55
+ margin-bottom: 12px;
56
+ }
streaming-react-app/src/StreamingInterface.tsx ADDED
@@ -0,0 +1,1221 @@
1
+ import {useCallback, useEffect, useLayoutEffect, useRef, useState} from 'react';
2
+ import Button from '@mui/material/Button';
3
+ import Typography from '@mui/material/Typography';
4
+ import InputLabel from '@mui/material/InputLabel';
5
+ import FormControl from '@mui/material/FormControl';
6
+ import Select, {SelectChangeEvent} from '@mui/material/Select';
7
+ import MenuItem from '@mui/material/MenuItem';
8
+ import Stack from '@mui/material/Stack';
9
+ import seamlessLogoUrl from './assets/illy.svg';
10
+ import {
11
+ AgentCapabilities,
12
+ BaseResponse,
13
+ BrowserAudioStreamConfig,
14
+ DynamicConfig,
15
+ PartialDynamicConfig,
16
+ SUPPORTED_INPUT_SOURCES,
17
+ SUPPORTED_OUTPUT_MODES,
18
+ ServerExceptionData,
19
+ ServerSpeechData,
20
+ ServerState,
21
+ ServerTextData,
22
+ StartStreamEventConfig,
23
+ StreamingStatus,
24
+ SupportedInputSource,
25
+ SupportedOutputMode,
26
+ TranslationSentences,
27
+ } from './types/StreamingTypes';
28
+ import FormLabel from '@mui/material/FormLabel';
29
+ import RadioGroup from '@mui/material/RadioGroup';
30
+ import FormControlLabel from '@mui/material/FormControlLabel';
31
+ import Radio from '@mui/material/Radio';
32
+ import './StreamingInterface.css';
33
+ import RoomConfig from './RoomConfig';
34
+ import Divider from '@mui/material/Divider';
35
+ import {useSocket} from './useSocket';
36
+ import {RoomState} from './types/RoomState';
37
+ import useStable from './useStable';
38
+ import float32To16BitPCM from './float32To16BitPCM';
39
+ import createBufferedSpeechPlayer from './createBufferedSpeechPlayer';
40
+ import Checkbox from '@mui/material/Checkbox';
41
+ import Alert from '@mui/material/Alert';
42
+ import isScrolledToDocumentBottom from './isScrolledToDocumentBottom';
43
+ import Box from '@mui/material/Box';
44
+ import Slider from '@mui/material/Slider';
45
+ import VolumeDown from '@mui/icons-material/VolumeDown';
46
+ import VolumeUp from '@mui/icons-material/VolumeUp';
47
+ import Mic from '@mui/icons-material/Mic';
48
+ import MicOff from '@mui/icons-material/MicOff';
49
+ import XRDialog from './react-xr/XRDialog';
50
+ import getTranslationSentencesFromReceivedData from './getTranslationSentencesFromReceivedData';
51
+ import {
52
+ sliceTranslationSentencesUpToIndex,
53
+ getTotalSentencesLength,
54
+ } from './sliceTranslationSentencesUtils';
55
+ import Blink from './Blink';
56
+ import {CURSOR_BLINK_INTERVAL_MS} from './cursorBlinkInterval';
57
+ import {getURLParams} from './URLParams';
58
+ import debug from './debug';
59
+ import DebugSection from './DebugSection';
60
+ import Switch from '@mui/material/Switch';
61
+ import Grid from '@mui/material/Grid';
62
+ import {getLanguageFromThreeLetterCode} from './languageLookup';
63
+ import HeadphonesIcon from '@mui/icons-material/Headphones';
64
+
65
+ const AUDIO_STREAM_DEFAULTS = {
66
+ userMedia: {
67
+ echoCancellation: false,
68
+ noiseSuppression: true,
69
+ },
70
+ displayMedia: {
71
+ echoCancellation: false,
72
+ noiseSuppression: false,
73
+ },
74
+ } as const;
75
+
76
+ async function requestUserMediaAudioStream(
77
+ config: BrowserAudioStreamConfig = AUDIO_STREAM_DEFAULTS['userMedia'],
78
+ ) {
79
+ const stream = await navigator.mediaDevices.getUserMedia({
80
+ audio: {...config, channelCount: 1},
81
+ });
82
+ console.debug(
83
+ '[requestUserMediaAudioStream] stream created with settings:',
84
+ stream.getAudioTracks()?.[0]?.getSettings(),
85
+ );
86
+ return stream;
87
+ }
88
+
89
+ async function requestDisplayMediaAudioStream(
90
+ config: BrowserAudioStreamConfig = AUDIO_STREAM_DEFAULTS['displayMedia'],
91
+ ) {
92
+ const stream = await navigator.mediaDevices.getDisplayMedia({
93
+ audio: {...config, channelCount: 1},
94
+ });
95
+ console.debug(
96
+ '[requestDisplayMediaAudioStream] stream created with settings:',
97
+ stream.getAudioTracks()?.[0]?.getSettings(),
98
+ );
99
+ return stream;
100
+ }
101
+
102
+ const buttonLabelMap: {[key in StreamingStatus]: string} = {
103
+ stopped: 'Start Streaming',
104
+ running: 'Stop Streaming',
105
+ starting: 'Starting...',
106
+ };
107
+
108
+ const BUFFER_LIMIT = 1;
109
+
110
+ const SCROLLED_TO_BOTTOM_THRESHOLD_PX = 36;
111
+
112
+ const GAIN_MULTIPLIER_OVER_1 = 3;
113
+
114
+ const getGainScaledValue = (value) =>
115
+ value > 1 ? (value - 1) * GAIN_MULTIPLIER_OVER_1 + 1 : value;
116
+
117
+ const TOTAL_ACTIVE_TRANSCODER_WARNING_THRESHOLD = 2;
118
+
119
+ const MAX_SERVER_EXCEPTIONS_TRACKED = 500;
120
+
121
+ export const TYPING_ANIMATION_DELAY_MS = 6;
122
+
123
+ export default function StreamingInterface() {
124
+ const urlParams = getURLParams();
125
+ const debugParam = urlParams.debug;
126
+ const [animateTextDisplay, setAnimateTextDisplay] = useState<boolean>(
127
+ urlParams.animateTextDisplay,
128
+ );
129
+
130
+ const socketObject = useSocket();
131
+ const {socket, clientID} = socketObject;
132
+
133
+ const [serverState, setServerState] = useState<ServerState | null>(null);
134
+ const [agent, setAgent] = useState<AgentCapabilities | null>(null);
135
+ const model = agent?.name ?? null;
136
+ const agentsCapabilities: Array<AgentCapabilities> =
137
+ serverState?.agentsCapabilities ?? [];
138
+ const currentAgent: AgentCapabilities | null =
139
+ agentsCapabilities.find((agent) => agent.name === model) ?? null;
140
+
141
+ const [serverExceptions, setServerExceptions] = useState<
142
+ Array<ServerExceptionData>
143
+ >([]);
144
+ const [roomState, setRoomState] = useState<RoomState | null>(null);
145
+ const roomID = roomState?.room_id ?? null;
146
+ const isSpeaker =
147
+ (clientID != null && roomState?.speakers.includes(clientID)) ?? false;
148
+ const isListener =
149
+ (clientID != null && roomState?.listeners.includes(clientID)) ?? false;
150
+
151
+ const [streamingStatus, setStreamingStatus] =
152
+ useState<StreamingStatus>('stopped');
153
+
154
+ const isStreamConfiguredRef = useRef<boolean>(false);
155
+ const [hasMaxSpeakers, setHasMaxSpeakers] = useState<boolean>(false);
156
+
157
+ const [outputMode, setOutputMode] = useState<SupportedOutputMode>('s2s&t');
158
+ const [inputSource, setInputSource] =
159
+ useState<SupportedInputSource>('userMedia');
160
+ const [enableNoiseSuppression, setEnableNoiseSuppression] = useState<
161
+ boolean | null
162
+ >(null);
163
+ const [enableEchoCancellation, setEnableEchoCancellation] = useState<
164
+ boolean | null
165
+ >(null);
166
+
167
+ // Dynamic Params:
168
+ const [targetLang, setTargetLang] = useState<string | null>(null);
169
+ const [enableExpressive, setEnableExpressive] = useState<boolean | null>(
170
+ null,
171
+ );
172
+
173
+ const [serverDebugFlag, setServerDebugFlag] = useState<boolean>(
174
+ debugParam ?? false,
175
+ );
176
+
177
+ const [receivedData, setReceivedData] = useState<Array<ServerTextData>>([]);
178
+ const [
179
+ translationSentencesAnimatedIndex,
180
+ setTranslationSentencesAnimatedIndex,
181
+ ] = useState<number>(0);
182
+
183
+ const lastTranslationResultRef = useRef<HTMLDivElement | null>(null);
184
+
185
+ const [inputStream, setInputStream] = useState<MediaStream | null>(null);
186
+ const [inputStreamSource, setInputStreamSource] =
187
+ useState<MediaStreamAudioSourceNode | null>(null);
188
+ const audioContext = useStable<AudioContext>(() => new AudioContext());
189
+ const [scriptNodeProcessor, setScriptNodeProcessor] =
190
+ useState<ScriptProcessorNode | null>(null);
191
+
192
+ const [muted, setMuted] = useState<boolean>(false);
193
+ // The onaudioprocess script needs an up-to-date reference to the muted state, so
194
+ // we use a ref here and keep it in sync via useEffect
195
+ const mutedRef = useRef<boolean>(muted);
196
+ useEffect(() => {
197
+ mutedRef.current = muted;
198
+ }, [muted]);
199
+
200
+ const [gain, setGain] = useState<number>(1);
201
+
202
+ const isScrolledToBottomRef = useRef<boolean>(isScrolledToDocumentBottom());
203
+
204
+ // Some config options must be set when starting streaming and cannot be changed dynamically.
205
+ // This controls whether they are disabled or not
206
+ const streamFixedConfigOptionsDisabled =
207
+ streamingStatus !== 'stopped' || roomID == null;
208
+
209
+ const bufferedSpeechPlayer = useStable(() => {
210
+ const player = createBufferedSpeechPlayer({
211
+ onStarted: () => {
212
+ console.debug('📢 PLAYBACK STARTED 📢');
213
+ },
214
+ onEnded: () => {
215
+ console.debug('🛑 PLAYBACK ENDED 🛑');
216
+ },
217
+ });
218
+
219
+ // Start the player now so it eagerly plays audio when it arrives
220
+ player.start();
221
+ return player;
222
+ });
223
+
224
+ const translationSentencesBase: TranslationSentences =
225
+ getTranslationSentencesFromReceivedData(receivedData);
226
+
227
+ const translationSentencesBaseTotalLength = getTotalSentencesLength(
228
+ translationSentencesBase,
229
+ );
230
+
231
+ const translationSentences: TranslationSentences = animateTextDisplay
232
+ ? sliceTranslationSentencesUpToIndex(
233
+ translationSentencesBase,
234
+ translationSentencesAnimatedIndex,
235
+ )
236
+ : translationSentencesBase;
237
+
238
+ // We want the blinking cursor to show before any text has arrived, so let's add an empty string so that the cursor shows up
239
+ const translationSentencesWithEmptyStartingString =
240
+ streamingStatus === 'running' && translationSentences.length === 0
241
+ ? ['']
242
+ : translationSentences;
243
+
244
+ /******************************************
245
+ * Event Handlers
246
+ ******************************************/
247
+
248
+ const setAgentAndUpdateParams = useCallback(
249
+ (newAgent: AgentCapabilities | null) => {
250
+ setAgent((prevAgent) => {
251
+ if (prevAgent?.name !== newAgent?.name) {
252
+ setTargetLang(newAgent?.targetLangs[0] ?? null);
253
+ setEnableExpressive(null);
254
+ }
255
+ return newAgent;
256
+ });
257
+ },
258
+ [],
259
+ );
260
+
261
+ const onSetDynamicConfig = useCallback(
262
+ async (partialConfig: PartialDynamicConfig) => {
263
+ return new Promise<void>((resolve, reject) => {
264
+ if (socket == null) {
265
+ reject(new Error('[onSetDynamicConfig] socket is null '));
266
+ return;
267
+ }
268
+
269
+ socket.emit(
270
+ 'set_dynamic_config',
271
+ partialConfig,
272
+ (result: BaseResponse) => {
273
+ console.log('[emit result: set_dynamic_config]', result);
274
+ if (result.status === 'ok') {
275
+ resolve();
276
+ } else {
277
+ reject();
278
+ }
279
+ },
280
+ );
281
+ });
282
+ },
283
+ [socket],
284
+ );
285
+
286
+ const configureStreamAsync = ({sampleRate}: {sampleRate: number}) => {
287
+ return new Promise<void>((resolve, reject) => {
288
+ if (socket == null) {
289
+ reject(new Error('[configureStreamAsync] socket is null '));
290
+ return;
291
+ }
292
+ const modelName = agent?.name ?? null;
293
+ if (modelName == null) {
294
+ reject(new Error('[configureStreamAsync] modelName is null '));
295
+ return;
296
+ }
297
+
298
+ const config: StartStreamEventConfig = {
299
+ event: 'config',
300
+ rate: sampleRate,
301
+ model_name: modelName,
302
+ debug: serverDebugFlag,
303
+ // synchronous processing isn't implemented on the v2 pubsub server, so hardcode this to true
304
+ async_processing: true,
305
+ buffer_limit: BUFFER_LIMIT,
306
+ model_type: outputMode,
307
+ };
308
+
309
+ console.log('[configureStreamAsync] sending config', config);
310
+
311
+ socket.emit('configure_stream', config, (statusObject) => {
312
+ setHasMaxSpeakers(statusObject.message === 'max_speakers');
313
+ if (statusObject.status === 'ok') {
314
+ isStreamConfiguredRef.current = true;
315
+ console.debug(
316
+ '[configureStreamAsync] stream configured!',
317
+ statusObject,
318
+ );
319
+ resolve();
320
+ } else {
321
+ isStreamConfiguredRef.current = false;
322
+ reject(
323
+ new Error(
324
+ `[configureStreamAsync] configure_stream returned status: ${statusObject.status}`,
325
+ ),
326
+ );
327
+ return;
328
+ }
329
+ });
330
+ });
331
+ };
332
+
333
+ const startStreaming = async () => {
334
+ if (streamingStatus !== 'stopped') {
335
+ console.warn(
336
+ `Attempting to start stream when status is ${streamingStatus}`,
337
+ );
338
+ return;
339
+ }
340
+
341
+ setStreamingStatus('starting');
342
+
343
+ if (audioContext.state === 'suspended') {
344
+ console.warn('audioContext was suspended! resuming...');
345
+ await audioContext.resume();
346
+ }
347
+
348
+ let stream: MediaStream | null = null;
349
+
350
+ try {
351
+ if (inputSource === 'userMedia') {
352
+ stream = await requestUserMediaAudioStream({
353
+ noiseSuppression:
354
+ enableNoiseSuppression ??
355
+ AUDIO_STREAM_DEFAULTS['userMedia'].noiseSuppression,
356
+ echoCancellation:
357
+ enableEchoCancellation ??
358
+ AUDIO_STREAM_DEFAULTS['userMedia'].echoCancellation,
359
+ });
360
+ } else if (inputSource === 'displayMedia') {
361
+ stream = await requestDisplayMediaAudioStream({
362
+ noiseSuppression:
363
+ enableNoiseSuppression ??
364
+ AUDIO_STREAM_DEFAULTS['displayMedia'].noiseSuppression,
365
+ echoCancellation:
366
+ enableEchoCancellation ??
367
+ AUDIO_STREAM_DEFAULTS['displayMedia'].echoCancellation,
368
+ });
369
+ } else {
370
+ throw new Error(`Unsupported input source requested: ${inputSource}`);
371
+ }
372
+ setInputStream(stream);
373
+ } catch (e) {
374
+ console.error('[startStreaming] media stream request failed:', e);
375
+ setStreamingStatus('stopped');
376
+ return;
377
+ }
378
+
379
+ const mediaStreamSource = audioContext.createMediaStreamSource(stream);
380
+ setInputStreamSource(mediaStreamSource);
381
+ /**
382
+ * NOTE: This currently uses a deprecated way of processing the audio (createScriptProcessor), but
383
+ * which is easy and convenient for our purposes.
384
+ *
385
+ * Documentation for the deprecated way of doing it is here: https://developer.mozilla.org/en-US/docs/Web/API/BaseAudioContext/createScriptProcessor
386
+ *
387
+ * In an ideal world this would be migrated to something like this SO answer: https://stackoverflow.com/a/65448287
388
+ */
389
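+ // 16384 samples is the largest buffer createScriptProcessor accepts (~0.37s at 44.1kHz);
+ // larger chunks mean fewer socket emits at the cost of some added latency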
+ const scriptProcessor = audioContext.createScriptProcessor(16384, 1, 1);
390
+ setScriptNodeProcessor(scriptProcessor);
391
+
392
+ scriptProcessor.onaudioprocess = (event) => {
393
+ if (isStreamConfiguredRef.current === false) {
394
+ console.debug('[onaudioprocess] stream is not configured yet!');
395
+ return;
396
+ }
397
+ if (socket == null) {
398
+ console.warn('[onaudioprocess] socket is null in onaudioprocess');
399
+ return;
400
+ }
401
+
402
+ if (mutedRef.current) {
403
+ // We still want to send audio to the server when we're muted to ensure we
404
+ // get any remaining audio back from the server, so we pass a single-element array containing 0
405
+ const mostlyEmptyInt16Array = new Int16Array(1);
406
+ socket.emit('incoming_audio', mostlyEmptyInt16Array);
407
+ } else {
408
+ const float32Audio = event.inputBuffer.getChannelData(0);
409
+ const pcm16Audio = float32To16BitPCM(float32Audio);
410
+ socket.emit('incoming_audio', pcm16Audio);
411
+ }
412
+
413
+ debug()?.sentAudio(event);
414
+ };
415
+
416
+ mediaStreamSource.connect(scriptProcessor);
417
+ scriptProcessor.connect(audioContext.destination);
418
+
419
+ bufferedSpeechPlayer.start();
420
+
421
+ try {
422
+ if (targetLang == null) {
423
+ throw new Error('[startStreaming] targetLang cannot be nullish');
424
+ }
425
+
426
+ // When we are starting the stream we want to pass all the dynamic config values
427
+ // available before actually configuring and starting the stream
428
+ const fullDynamicConfig: DynamicConfig = {
429
+ targetLanguage: targetLang,
430
+ expressive: enableExpressive,
431
+ };
432
+
433
+ await onSetDynamicConfig(fullDynamicConfig);
434
+
435
+ // NOTE: this needs to be the *audioContext* sample rate, not the sample rate of the input stream. Not entirely sure why.
436
+ await configureStreamAsync({
437
+ sampleRate: audioContext.sampleRate,
438
+ });
439
+ } catch (e) {
440
+ console.error('configureStreamAsync failed', e);
441
+ setStreamingStatus('stopped');
442
+ return;
443
+ }
444
+
445
+ setStreamingStatus('running');
446
+ };
447
+
448
+ const stopStreaming = useCallback(async () => {
449
+ if (streamingStatus === 'stopped') {
450
+ console.warn(
451
+ `Attempting to stop stream when status is ${streamingStatus}`,
452
+ );
453
+ return;
454
+ }
455
+
456
+ // Stop the speech playback right away
457
+ bufferedSpeechPlayer.stop();
458
+
459
+ if (inputStreamSource == null || scriptNodeProcessor == null) {
460
+ console.error(
461
+ 'inputStreamSource || scriptNodeProcessor is null in stopStreaming',
462
+ );
463
+ } else {
464
+ inputStreamSource.disconnect(scriptNodeProcessor);
465
+ scriptNodeProcessor.disconnect(audioContext.destination);
466
+
467
+ // Release the mic input so we stop showing the red recording icon in the browser
468
+ inputStream?.getTracks().forEach((track) => track.stop());
469
+ }
470
+
471
+ if (socket == null) {
472
+ console.warn('Unable to emit stop_stream because socket is null');
473
+ } else {
474
+ socket.emit('stop_stream', (result) => {
475
+ console.debug('[emit result: stop_stream]', result);
476
+ });
477
+ }
478
+
479
+ setStreamingStatus('stopped');
480
+ }, [
481
+ audioContext.destination,
482
+ bufferedSpeechPlayer,
483
+ inputStream,
484
+ inputStreamSource,
485
+ scriptNodeProcessor,
486
+ socket,
487
+ streamingStatus,
488
+ ]);
489
+
490
+ const onClearTranscriptForAll = useCallback(() => {
491
+ if (socket != null) {
492
+ socket.emit('clear_transcript_for_all');
493
+ }
494
+ }, [socket]);
495
+
496
+ /******************************************
497
+ * Effects
498
+ ******************************************/
499
+
500
+ useEffect(() => {
501
+ if (socket == null) {
502
+ return;
503
+ }
504
+
505
+ const onRoomStateUpdate = (roomState: RoomState) => {
506
+ setRoomState(roomState);
507
+ };
508
+
509
+ socket.on('room_state_update', onRoomStateUpdate);
510
+
511
+ return () => {
512
+ socket.off('room_state_update', onRoomStateUpdate);
513
+ };
514
+ }, [socket]);
515
+
516
+ useEffect(() => {
517
+ if (socket != null) {
518
+ const onTranslationText = (data: ServerTextData) => {
519
+ setReceivedData((prev) => [...prev, data]);
520
+ debug()?.receivedText(data.payload);
521
+ };
522
+
523
+ const onTranslationSpeech = (data: ServerSpeechData) => {
524
+ bufferedSpeechPlayer.addAudioToBuffer(data.payload, data.sample_rate);
525
+ };
526
+
527
+ socket.on('translation_text', onTranslationText);
528
+ socket.on('translation_speech', onTranslationSpeech);
529
+
530
+ return () => {
531
+ socket.off('translation_text', onTranslationText);
532
+ socket.off('translation_speech', onTranslationSpeech);
533
+ };
534
+ }
535
+ }, [bufferedSpeechPlayer, socket]);
536
+
537
+ useEffect(() => {
538
+ if (socket != null) {
539
+ const onServerStateUpdate = (newServerState: ServerState) => {
540
+ setServerState(newServerState);
541
+
542
+ // If a client creates a server lock, we want to stop streaming if we're not them
543
+ if (
544
+ newServerState.serverLock?.isActive === true &&
545
+ newServerState.serverLock?.clientID !== clientID &&
546
+ streamingStatus === 'running'
547
+ ) {
548
+ stopStreaming();
549
+ }
550
+
551
+ const firstAgentNullable = newServerState.agentsCapabilities[0];
552
+ if (agent == null && firstAgentNullable != null) {
553
+ setAgentAndUpdateParams(firstAgentNullable);
554
+ }
555
+ };
556
+
557
+ socket.on('server_state_update', onServerStateUpdate);
558
+
559
+ return () => {
560
+ socket.off('server_state_update', onServerStateUpdate);
561
+ };
562
+ }
563
+ }, [
564
+ agent,
565
+ clientID,
566
+ setAgentAndUpdateParams,
567
+ socket,
568
+ stopStreaming,
569
+ streamingStatus,
570
+ ]);
571
+
572
+ useEffect(() => {
573
+ if (socket != null) {
574
+ const onServerException = (
575
+ exceptionDataWithoutClientTime: ServerExceptionData,
576
+ ) => {
577
+ const exceptionData = {
578
+ ...exceptionDataWithoutClientTime,
579
+ timeStringClient: new Date(
580
+ exceptionDataWithoutClientTime['timeEpochMs'],
581
+ ).toLocaleString(),
582
+ };
583
+
584
+ setServerExceptions((prev) =>
585
+ [exceptionData, ...prev].slice(0, MAX_SERVER_EXCEPTIONS_TRACKED),
586
+ );
587
+ console.error(
588
+ `[server_exception] The server encountered an exception: ${exceptionData['message']}`,
589
+ exceptionData,
590
+ );
591
+ };
592
+
593
+ socket.on('server_exception', onServerException);
594
+
595
+ return () => {
596
+ socket.off('server_exception', onServerException);
597
+ };
598
+ }
599
+ }, [socket]);
600
+
601
+ useEffect(() => {
602
+ if (socket != null) {
603
+ const onClearTranscript = () => {
604
+ setReceivedData([]);
605
+ setTranslationSentencesAnimatedIndex(0);
606
+ };
607
+
608
+ socket.on('clear_transcript', onClearTranscript);
609
+
610
+ return () => {
611
+ socket.off('clear_transcript', onClearTranscript);
612
+ };
613
+ }
614
+ }, [socket]);
615
+
616
+ useEffect(() => {
617
+ const onScroll = () => {
618
+ if (isScrolledToDocumentBottom(SCROLLED_TO_BOTTOM_THRESHOLD_PX)) {
619
+ isScrolledToBottomRef.current = true;
620
+ return;
621
+ }
622
+ isScrolledToBottomRef.current = false;
623
+ return;
624
+ };
625
+
626
+ document.addEventListener('scroll', onScroll);
627
+
628
+ return () => {
629
+ document.removeEventListener('scroll', onScroll);
630
+ };
631
+ }, []);
632
+
633
+ useLayoutEffect(() => {
634
+ if (
635
+ lastTranslationResultRef.current != null &&
636
+ isScrolledToBottomRef.current
637
+ ) {
638
+ // Scroll the div to the most recent entry
639
+ lastTranslationResultRef.current.scrollIntoView();
640
+ }
641
+ // Run the effect every time data is received, so that
642
+ // we scroll to the bottom even if we're just adding text to
643
+ // a pre-existing chunk
644
+ }, [receivedData]);
645
+
646
+ useEffect(() => {
647
+ if (!animateTextDisplay) {
648
+ return;
649
+ }
650
+
651
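+ // Reveal one more character every TYPING_ANIMATION_DELAY_MS until the animated index catches up with the full text length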
+ if (
652
+ translationSentencesAnimatedIndex < translationSentencesBaseTotalLength
653
+ ) {
654
+ const timeout = setTimeout(() => {
655
+ setTranslationSentencesAnimatedIndex((prev) => prev + 1);
656
+ debug()?.startRenderText();
657
+ }, TYPING_ANIMATION_DELAY_MS);
658
+
659
+ return () => clearTimeout(timeout);
660
+ } else {
661
+ debug()?.endRenderText();
662
+ }
663
+ }, [
664
+ animateTextDisplay,
665
+ translationSentencesAnimatedIndex,
666
+ translationSentencesBaseTotalLength,
667
+ ]);
668
+
669
+ /******************************************
670
+ * Sub-components
671
+ ******************************************/
672
+
673
+ const volumeSliderNode = (
674
+ <Stack
675
+ spacing={2}
676
+ direction="row"
677
+ sx={{mb: 1, width: '100%'}}
678
+ alignItems="center">
679
+ <VolumeDown color="primary" />
680
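+ {/* Slider positions 0-3 are mapped to the actual gain via getGainScaledValue; the mark labels reflect the scaled values */}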
+ <Slider
681
+ aria-label="Volume"
682
+ defaultValue={1}
683
+ scale={getGainScaledValue}
684
+ min={0}
685
+ max={3}
686
+ step={0.1}
687
+ marks={[
688
+ {value: 0, label: '0%'},
689
+ {value: 1, label: '100%'},
690
+ {value: 2, label: '400%'},
691
+ {value: 3, label: '700%'},
692
+ ]}
693
+ valueLabelFormat={(value) => `${(value * 100).toFixed(0)}%`}
694
+ valueLabelDisplay="auto"
695
+ value={gain}
696
+ onChange={(_event: Event, newValue: number | number[]) => {
697
+ if (typeof newValue === 'number') {
698
+ const scaledGain = getGainScaledValue(newValue);
699
+ // We want the actual gain node to use the scaled value
700
+ bufferedSpeechPlayer.setGain(scaledGain);
701
+ // But we want react state to keep track of the non-scaled value
702
+ setGain(newValue);
703
+ } else {
704
+ console.error(
705
+ `[volume slider] Unexpected non-number value: ${newValue}`,
706
+ );
707
+ }
708
+ }}
709
+ />
710
+ <VolumeUp color="primary" />
711
+ </Stack>
712
+ );
713
+
714
+ const xrDialogComponent = (
715
+ <XRDialog
716
+ animateTextDisplay={
717
+ animateTextDisplay &&
718
+ translationSentencesAnimatedIndex === translationSentencesBaseTotalLength
719
+ }
720
+ bufferedSpeechPlayer={bufferedSpeechPlayer}
721
+ translationSentences={translationSentences}
722
+ roomState={roomState}
723
+ roomID={roomID}
724
+ startStreaming={startStreaming}
725
+ stopStreaming={stopStreaming}
726
+ debugParam={debugParam}
727
+ onARHidden={() => {
728
+ setAnimateTextDisplay(urlParams.animateTextDisplay);
729
+ }}
730
+ onARVisible={() => setAnimateTextDisplay(false)}
731
+ />
732
+ );
733
+
734
+ return (
735
+ <div className="app-wrapper-sra">
736
+ <Box
737
+ // eslint-disable-next-line @typescript-eslint/ban-ts-comment
738
+ // @ts-ignore Not sure why it's complaining about complexity here
739
+ sx={{width: '100%', maxWidth: '660px', minWidth: '320px'}}>
740
+ <div className="main-container-sra">
741
+ <div className="top-section-sra horizontal-padding-sra">
742
+ <div className="header-container-sra">
743
+ <img
744
+ src={seamlessLogoUrl}
745
+ className="header-icon-sra"
746
+ alt="Seamless Translation Logo"
747
+ height={150}
748
+ width={225}
749
+ />
750
+
751
+ <div>
752
+ <Typography variant="h1" sx={{color: '#800020'}}>
753
+ Illy's translator
754
+ </Typography>
755
+ <Typography variant="body2" sx={{color: '#800020'}}>
756
+ <span style={{ fontStyle: 'italic' }}>
757
+ Natale 2023
758
+ </span>
759
+ </Typography>
760
+ </div>
761
+ </div>
762
+ <div className="header-container-sra">
763
+ <div>
764
+ <Typography variant="body2" sx={{color: '#65676B'}}>
765
+ Hey <strong style={{ fontWeight: 'bold' }}>Illy</strong>, <strong style={{ fontWeight: 'bold' }}>it's lovely to see you!</strong>
766
+ <br/>
767
+ You can use this platform to translate from/to Italian and many other languages.
768
+ <br/>
769
+ Use headphones if you are both speaker and listener to prevent feedback.
770
+ <br/>
771
+ <br/>
772
+ <a target="_blank" rel="noopener noreferrer" href="https://ai.meta.com/research/seamless-communication/">SeamlessStreaming</a> is
773
+ a research model and streaming quality works best if you pause
774
+ every couple of sentences. The real-time performance will degrade
775
+ if you try streaming multiple speakers at the same time.
776
+ <br/>
777
+ <br/>
778
+ Let's try!
779
+ </Typography>
780
+ </div>
781
+ </div>
782
+ <Stack spacing="22px" direction="column">
783
+ <Box>
784
+ <RoomConfig
785
+ roomState={roomState}
786
+ serverState={serverState}
787
+ streamingStatus={streamingStatus}
788
+ onJoinRoomOrUpdateRoles={() => {
789
+ // If the user has switched from speaker to listener we need to tell the
790
+ // player to play eagerly, since currently the listener doesn't have any stop/start controls
791
+ bufferedSpeechPlayer.start();
792
+ }}
793
+ />
794
+
795
+ {isListener && !isSpeaker && (
796
+ <Box
797
+ sx={{
798
+ paddingX: 6,
799
+ paddingBottom: 2,
800
+ marginY: 2,
801
+ display: 'flex',
802
+ flexDirection: 'column',
803
+ alignItems: 'center',
804
+ }}>
805
+ {volumeSliderNode}
806
+ </Box>
807
+ )}
808
+ </Box>
809
+
810
+ {isSpeaker && (
811
+ <>
812
+ <Divider />
813
+
814
+ <Stack spacing="12px" direction="column">
815
+ {/* <FormLabel id="output-modes-radio-group-label">
816
+ Model
817
+ </FormLabel> */}
818
+ <FormControl
819
+ disabled={
820
+ streamFixedConfigOptionsDisabled ||
821
+ agentsCapabilities.length === 0
822
+ }
823
+ fullWidth
824
+ sx={{minWidth: '14em'}}>
825
+ {/* <InputLabel id="model-selector-input-label">
826
+ Model
827
+ </InputLabel> */}
828
+ {/* <Select
829
+ labelId="model-selector-input-label"
830
+ label="Model"
831
+ onChange={(e: SelectChangeEvent) => {
832
+ const newAgent =
833
+ agentsCapabilities.find(
834
+ (agent) => e.target.value === agent.name,
835
+ ) ?? null;
836
+ if (newAgent == null) {
837
+ console.error(
838
+ 'Unable to find agent with name',
839
+ e.target.value,
840
+ );
841
+ }
842
+ setAgentAndUpdateParams(newAgent);
843
+ }}
844
+ value={model ?? ''}>
845
+ {agentsCapabilities.map((agent) => (
846
+ <MenuItem value={agent.name} key={agent.name}>
847
+ {agent.name}
848
+ </MenuItem>
849
+ ))}
850
+ </Select> */}
851
+ </FormControl>
852
+
853
+ </Stack>
854
+
855
+ <Stack spacing={0.5}>
856
+ <FormLabel id="output-modes-radio-group-label">
857
+ Illy, can you please select the target language?
858
+ </FormLabel>
859
+
860
+ <Box sx={{paddingTop: 2, paddingBottom: 1}}>
861
+ <FormControl fullWidth sx={{minWidth: '14em'}}>
862
+ <InputLabel id="target-selector-input-label">
863
+ Target Language
864
+ </InputLabel>
865
+ <Select
866
+ labelId="target-selector-input-label"
867
+ label="Target Language"
868
+ onChange={(e: SelectChangeEvent) => {
869
+ setTargetLang(e.target.value);
870
+ onSetDynamicConfig({
871
+ targetLanguage: e.target.value,
872
+ });
873
+ }}
874
+ value={targetLang ?? ''}>
875
+ {currentAgent?.targetLangs.map((langCode) => (
876
+ <MenuItem value={langCode} key={langCode}>
877
+ {getLanguageFromThreeLetterCode(langCode) != null
878
+ ? `${getLanguageFromThreeLetterCode(
879
+ langCode,
880
+ )} (${langCode})`
881
+ : langCode}
882
+ </MenuItem>
883
+ ))}
884
+ </Select>
885
+ </FormControl>
886
+ </Box>
887
+
888
+ <Grid container>
889
+ <Grid item xs={12} sm={4}>
890
+ <FormControl
891
+ disabled={streamFixedConfigOptionsDisabled}>
892
+ <RadioGroup
893
+ aria-labelledby="output-modes-radio-group-label"
894
+ value={outputMode}
895
+ onChange={(e) =>
896
+ setOutputMode(
897
+ e.target.value as SupportedOutputMode,
898
+ )
899
+ }
900
+ name="output-modes-radio-buttons-group">
901
+ {
902
+ // TODO: Use supported modalities from agentCapabilities
903
+ SUPPORTED_OUTPUT_MODES.map(({value, label}) => (
904
+ <FormControlLabel
905
+ key={value}
906
+ value={value}
907
+ control={<Radio />}
908
+ label={label}
909
+ />
910
+ ))
911
+ }
912
+ </RadioGroup>
913
+ </FormControl>
914
+ </Grid>
915
+
916
+ <Grid item xs={12} sm={8}>
917
+ <Stack
918
+ direction="column"
919
+ spacing={1}
920
+ alignItems="flex-start"
921
+ sx={{flexGrow: 1}}>
922
+ {/* {currentAgent?.dynamicParams?.includes(
923
+ 'expressive',
924
+ ) && (
925
+ <FormControlLabel
926
+ control={
927
+ <Switch
928
+ checked={enableExpressive ?? false}
929
+ onChange={(
930
+ event: React.ChangeEvent<HTMLInputElement>,
931
+ ) => {
932
+ const newValue = event.target.checked;
933
+ setEnableExpressive(newValue);
934
+ onSetDynamicConfig({
935
+ expressive: newValue,
936
+ });
937
+ }}
938
+ />
939
+ }
940
+ label="Expressive"
941
+ />
942
+ )} */}
943
+
944
+ {isListener && (
945
+ <Box
946
+ sx={{
947
+ flexGrow: 1,
948
+ paddingX: 1.5,
949
+ paddingY: 1.5,
950
+ width: '100%',
951
+ }}>
952
+ {volumeSliderNode}
953
+ </Box>
954
+ )}
955
+ </Stack>
956
+ </Grid>
957
+ </Grid>
958
+ </Stack>
959
+
960
+ <Stack
961
+ direction="row"
962
+ spacing={2}
963
+ justifyContent="space-between">
964
+ <Box sx={{flex: 1}}>
965
+ <FormControl disabled={streamFixedConfigOptionsDisabled}>
966
+ {/* <FormLabel id="input-source-radio-group-label">
967
+ Input Source
968
+ </FormLabel> */}
969
+ {/* <RadioGroup
970
+ aria-labelledby="input-source-radio-group-label"
971
+ value={inputSource}
972
+ onChange={(e: React.ChangeEvent<HTMLInputElement>) =>
973
+ setInputSource(
974
+ e.target.value as SupportedInputSource,
975
+ )
976
+ }
977
+ name="input-source-radio-buttons-group">
978
+ {SUPPORTED_INPUT_SOURCES.map(({label, value}) => (
979
+ <FormControlLabel
980
+ key={value}
981
+ value={value}
982
+ control={<Radio />}
983
+ label={label}
984
+ />
985
+ ))} */}
986
+ {/* </RadioGroup> */}
987
+ </FormControl>
988
+ </Box>
989
+
990
+ <Box sx={{flex: 1, flexGrow: 2}}>
991
+ <FormControl disabled={streamFixedConfigOptionsDisabled}>
992
+ <FormLabel>Options</FormLabel>
993
+ <FormControlLabel
994
+ control={
995
+ <Checkbox
996
+ checked={
997
+ enableNoiseSuppression ??
998
+ AUDIO_STREAM_DEFAULTS[inputSource]
999
+ .noiseSuppression
1000
+ }
1001
+ onChange={(
1002
+ event: React.ChangeEvent<HTMLInputElement>,
1003
+ ) =>
1004
+ setEnableNoiseSuppression(event.target.checked)
1005
+ }
1006
+ />
1007
+ }
1008
+ label="Noise Suppression"
1009
+ />
1010
+ <FormControlLabel
1011
+ control={
1012
+ <Checkbox
1013
+ checked={
1014
+ enableEchoCancellation ??
1015
+ AUDIO_STREAM_DEFAULTS[inputSource]
1016
+ .echoCancellation
1017
+ }
1018
+ onChange={(
1019
+ event: React.ChangeEvent<HTMLInputElement>,
1020
+ ) =>
1021
+ setEnableEchoCancellation(event.target.checked)
1022
+ }
1023
+ />
1024
+ }
1025
+ label="Echo Cancellation (not recommended)"
1026
+ />
1027
+ <FormControlLabel
1028
+ control={
1029
+ <Checkbox
1030
+ checked={serverDebugFlag}
1031
+ onChange={(
1032
+ event: React.ChangeEvent<HTMLInputElement>,
1033
+ ) => setServerDebugFlag(event.target.checked)}
1034
+ />
1035
+ }
1036
+ label="Enable Server Debugging"
1037
+ />
1038
+ </FormControl>
1039
+ </Box>
1040
+ </Stack>
1041
+
1042
+ {isSpeaker &&
1043
+ isListener &&
1044
+ inputSource === 'userMedia' &&
1045
+ !enableEchoCancellation &&
1046
+ gain !== 0 && (
1047
+ <div>
1048
+ <Alert severity="warning" icon={<HeadphonesIcon />}>
1049
+ Headphones required to prevent feedback.
1050
+ </Alert>
1051
+ </div>
1052
+ )}
1053
+
1054
+ {isSpeaker && enableEchoCancellation && (
1055
+ <div>
1056
+ <Alert severity="warning">
1057
+ We don't recommend using echo cancellation as it may
1058
+ distort the input audio. If possible, use headphones and
1059
+ disable echo cancellation instead.
1060
+ </Alert>
1061
+ </div>
1062
+ )}
1063
+
1064
+ <Stack direction="row" spacing={2}>
1065
+ {streamingStatus === 'stopped' ? (
1066
+ <Button
1067
+ variant="contained"
1068
+ onClick={startStreaming}
1069
+ disabled={
1070
+ roomID == null ||
1071
+ // Prevent users from starting streaming if there is a server lock with an active session
1072
+ (serverState?.serverLock?.isActive === true &&
1073
+ serverState.serverLock.clientID !== clientID)
1074
+ }>
1075
+ {buttonLabelMap[streamingStatus]}
1076
+ </Button>
1077
+ ) : (
1078
+ <Button
1079
+ variant="contained"
1080
+ color={
1081
+ streamingStatus === 'running' ? 'error' : 'primary'
1082
+ }
1083
+ disabled={
1084
+ streamingStatus === 'starting' || roomID == null
1085
+ }
1086
+ onClick={stopStreaming}>
1087
+ {buttonLabelMap[streamingStatus]}
1088
+ </Button>
1089
+ )}
1090
+
1091
+ <Box>
1092
+ <Button
1093
+ variant="contained"
1094
+ aria-label={muted ? 'Unmute' : 'Mute'}
1095
+ color={muted ? 'info' : 'primary'}
1096
+ onClick={() => setMuted((prev) => !prev)}
1097
+ sx={{
1098
+ borderRadius: 100,
1099
+ paddingX: 0,
1100
+ minWidth: '36px',
1101
+ }}>
1102
+ {muted ? <MicOff /> : <Mic />}
1103
+ </Button>
1104
+ </Box>
1105
+
1106
+ {roomID == null ? null : (
1107
+ <Box
1108
+ sx={{
1109
+ flexGrow: 1,
1110
+ display: 'flex',
1111
+ justifyContent: 'flex-end',
1112
+ }}>
1113
+ {xrDialogComponent}
1114
+ </Box>
1115
+ )}
1116
+ </Stack>
1117
+
1118
+ {serverExceptions.length > 0 && (
1119
+ <div>
1120
+ <Alert severity="error">
1121
+ {`The server encountered an exception. See the browser console for details. You may need to refresh the page to continue using the app.`}
1122
+ </Alert>
1123
+ </div>
1124
+ )}
1125
+ {serverState != null && hasMaxSpeakers && (
1126
+ <div>
1127
+ <Alert severity="error">
1128
+ {`Maximum number of speakers reached. Please try again at a later time.`}
1129
+ </Alert>
1130
+ </div>
1131
+ )}
1132
+ {serverState != null &&
1133
+ serverState.totalActiveTranscoders >=
1134
+ TOTAL_ACTIVE_TRANSCODER_WARNING_THRESHOLD && (
1135
+ <div>
1136
+ <Alert severity="warning">
1137
+ {`The server currently has ${serverState?.totalActiveTranscoders} active streaming sessions. Performance may be degraded.`}
1138
+ </Alert>
1139
+ </div>
1140
+ )}
1141
+
1142
+ {serverState?.serverLock != null &&
1143
+ serverState.serverLock.clientID !== clientID && (
1144
+ <div>
1145
+ <Alert severity="warning">
1146
+ {`The server is currently locked. Priority will be given to that client when they are streaming, and your streaming session may be halted abruptly.`}
1147
+ </Alert>
1148
+ </div>
1149
+ )}
1150
+ </>
1151
+ )}
1152
+ </Stack>
1153
+
1154
+ {isListener && !isSpeaker && (
1155
+ <Box sx={{marginBottom: 1, marginTop: 2}}>
1156
+ {xrDialogComponent}
1157
+ </Box>
1158
+ )}
1159
+ </div>
1160
+
1161
+ {debugParam && roomID != null && <DebugSection />}
1162
+
1163
+ <div className="translation-text-container-sra horizontal-padding-sra">
1164
+ <Stack
1165
+ direction="row"
1166
+ spacing={2}
1167
+ sx={{mb: '16px', alignItems: 'center'}}>
1168
+ <Typography variant="h1" sx={{fontWeight: 700, flexGrow: 1}}>
1169
+ Transcript
1170
+ </Typography>
1171
+ {isSpeaker && (
1172
+ <Button
1173
+ variant="text"
1174
+ size="small"
1175
+ onClick={onClearTranscriptForAll}>
1176
+ Clear Transcript for All
1177
+ </Button>
1178
+ )}
1179
+ </Stack>
1180
+ <Stack direction="row">
1181
+ <div className="translation-text-sra">
1182
+ {translationSentencesWithEmptyStartingString.map(
1183
+ (sentence, index, arr) => {
1184
+ const isLast = index === arr.length - 1;
1185
+ const maybeRef = isLast
1186
+ ? {ref: lastTranslationResultRef}
1187
+ : {};
1188
+ return (
1189
+ <div className="text-chunk-sra" key={index} {...maybeRef}>
1190
+ <Typography variant="body1">
1191
+ {sentence}
1192
+ {animateTextDisplay && isLast && (
1193
+ <Blink
1194
+ intervalMs={CURSOR_BLINK_INTERVAL_MS}
1195
+ shouldBlink={
1196
+ (roomState?.activeTranscoders ?? 0) > 0
1197
+ }>
1198
+ <Typography
1199
+ component="span"
1200
+ variant="body1"
1201
+ sx={{
1202
+ display: 'inline-block',
1203
+ transform: 'scaleY(1.25) translateY(-1px)',
1204
+ }}>
1205
+ {'|'}
1206
+ </Typography>
1207
+ </Blink>
1208
+ )}
1209
+ </Typography>
1210
+ </div>
1211
+ );
1212
+ },
1213
+ )}
1214
+ </div>
1215
+ </Stack>
1216
+ </div>
1217
+ </div>
1218
+ </Box>
1219
+ </div>
1220
+ );
1221
+ }
streaming-react-app/src/URLParams.ts ADDED
@@ -0,0 +1,50 @@
1
+ import { getBooleanParamFlag, getStringParamFlag } from './getParamFlag';
2
+ import { URLParamsObject } from './types/URLParamsTypes';
3
+
4
+ /**
5
+ * These are the URL parameters you can provide to the app to change its behavior.
6
+ *
7
+ * Boolean flags can be set by just providing the flag name (`?autoJoin`), or by
8
+ * explicitly setting it to 1 (true) or 0 (false): `?autoJoin=1` or `?autoJoin=0`
9
+ *
10
+ * String flags require an explicit value: `?roomID=ABCD`
11
+ *
12
+ * Examples:
13
+ *
14
+ * - `http://localhost:5173/?roomID=BBCD&autoJoin&debug`
15
+ * - `http://localhost:5173/?serverURL=localhost:8000`
16
+ *
17
+ * @returns
18
+ */
19
+
20
+ export function getURLParams(): URLParamsObject {
21
+ return {
22
+ // animate the translation text when it arrives, typing it out one letter at a time
23
+ animateTextDisplay: getBooleanParamFlag('animateTextDisplay', true), // default to true
24
+
25
+ // automatically join the room when the app loads. requires roomID to be set via url param as well
26
+ autoJoin: getBooleanParamFlag('autoJoin', false),
27
+
28
+ // automatically check the server debug flag as true
29
+ debug: getBooleanParamFlag('debug', false),
30
+
31
+ // Enable UI on the client that allows locking out other users of the server when it's being used for high profile demos
32
+ // NOTE: There is an escape hatch for disabling a server lock by setting the name field to remove_server_lock
33
+ enableServerLock: getBooleanParamFlag('enableServerLock', false),
34
+
35
+ // Pre-populate the Room Code field with the provided roomID. Can be used in conjunction with autoJoin to jump straight into the room
36
+ roomID: getStringParamFlag('roomID'),
37
+
38
+ // Use an alternate server URL as the streaming server (useful for pointing to dev servers: http://localhost:5173/?serverURL=localhost:8000)
39
+ serverURL: getStringParamFlag('serverURL'),
40
+
41
+ // Skip the popup dialog that displays within VR, which is mostly redundant with the web based dialog
42
+ skipARIntro: getBooleanParamFlag('skipARIntro', true), // default to true
43
+
44
+ // Shows the translation text in AR in front of an opaque panel covering all the text area
45
+ // single_block = original single text block with background
46
+ // lines = each line is a separate block and animates
47
+ // lines_with_background = adds a panel behind lines
48
+ ARTranscriptionType: getStringParamFlag('ARTranscriptionType') || 'lines',
49
+ };
50
+ }
streaming-react-app/src/assets/Roboto-msdf.json ADDED
The diff for this file is too large to render. See raw diff
 
streaming-react-app/src/assets/Roboto-msdf.png ADDED
streaming-react-app/src/assets/RobotoMono-Regular-msdf.json ADDED
The diff for this file is too large to render. See raw diff
 
streaming-react-app/src/assets/RobotoMono-Regular.png ADDED
streaming-react-app/src/assets/illy.svg ADDED
streaming-react-app/src/assets/illy2.svg ADDED
streaming-react-app/src/createBufferedSpeechPlayer.ts ADDED
@@ -0,0 +1,173 @@
1
+ import debug from './debug';
2
+
3
+ type AddAudioToBufferFunction = (
4
+ samples: Array<number>,
5
+ sampleRate: number,
6
+ ) => void;
7
+
8
+ export type BufferedSpeechPlayer = {
9
+ addAudioToBuffer: AddAudioToBufferFunction;
10
+ setGain: (gain: number) => void;
11
+ start: () => void;
12
+ stop: () => void;
13
+ };
14
+
15
+ type Options = {
16
+ onEnded?: () => void;
17
+ onStarted?: () => void;
18
+ };
19
+
20
+ export default function createBufferedSpeechPlayer({
21
+ onStarted,
22
+ onEnded,
23
+ }: Options): BufferedSpeechPlayer {
24
+ const audioContext = new AudioContext();
25
+ const gainNode = audioContext.createGain();
26
+ gainNode.connect(audioContext.destination);
27
+
28
+ let unplayedAudioBuffers: Array<AudioBuffer> = [];
29
+
30
+ let currentPlayingBufferSource: AudioBufferSourceNode | null = null;
31
+
32
+ let isPlaying = false;
33
+
34
+ // This means that the player starts in the 'stopped' state, and you need to call player.start() for it to start playing
35
+ let shouldPlayWhenAudioAvailable = false;
36
+
37
+ const setGain = (gain: number) => {
38
+ gainNode.gain.setValueAtTime(gain, audioContext.currentTime);
39
+ };
40
+
41
+ const start = () => {
42
+ shouldPlayWhenAudioAvailable = true;
43
+ debug()?.start();
44
+ playNextBufferIfNotAlreadyPlaying();
45
+ };
46
+
47
+ // Stop will stop the audio and clear the buffers
48
+ const stop = () => {
49
+ shouldPlayWhenAudioAvailable = false;
50
+
51
+ // Stop the current buffers
52
+ currentPlayingBufferSource?.stop();
53
+ currentPlayingBufferSource = null;
54
+
55
+ unplayedAudioBuffers = [];
56
+
57
+ onEnded != null && onEnded();
58
+ isPlaying = false;
59
+ return;
60
+ };
61
+
62
+ const playNextBufferIfNotAlreadyPlaying = () => {
63
+ if (!isPlaying) {
64
+ playNextBuffer();
65
+ }
66
+ };
67
+
68
+ const playNextBuffer = () => {
69
+ if (shouldPlayWhenAudioAvailable === false) {
70
+ console.debug(
71
+ '[BufferedSpeechPlayer][playNextBuffer] Not playing any more audio because shouldPlayWhenAudioAvailable is false.',
72
+ );
73
+ // NOTE: we do not need to set isPlaying = false or call onEnded because that will be handled in the stop() function
74
+ return;
75
+ }
76
+ if (unplayedAudioBuffers.length === 0) {
77
+ console.debug(
78
+ '[BufferedSpeechPlayer][playNextBuffer] No buffers to play.',
79
+ );
80
+ if (isPlaying) {
81
+ isPlaying = false;
82
+ onEnded != null && onEnded();
83
+ }
84
+ return;
85
+ }
86
+
87
+ // If isPlaying is false, then we are starting playback fresh rather than continuing it, and should call onStarted
88
+ if (isPlaying === false) {
89
+ isPlaying = true;
90
+ onStarted != null && onStarted();
91
+ }
92
+
93
+ const source = audioContext.createBufferSource();
94
+
95
+ // Get the first unplayed buffer from the array, and remove it from the array
96
+ const buffer = unplayedAudioBuffers.shift() ?? null;
97
+ source.buffer = buffer;
98
+ console.debug(
99
+ `[BufferedSpeechPlayer] Playing buffer with ${source.buffer?.length} samples`,
100
+ );
101
+
102
+ source.connect(gainNode);
103
+
104
+ const startTime = new Date().getTime();
105
+ source.start();
106
+ currentPlayingBufferSource = source;
107
+ // This is probably not necessary, but it doesn't hurt
108
+ isPlaying = true;
109
+
110
+ // TODO: consider changing this to a while loop to avoid deep recursion
111
+ const onThisBufferPlaybackEnded = () => {
112
+ console.debug(
113
+ `[BufferedSpeechPlayer] Buffer with ${source.buffer?.length} samples ended.`,
114
+ );
115
+ source.removeEventListener('ended', onThisBufferPlaybackEnded);
116
+ const endTime = new Date().getTime();
117
+ debug()?.playedAudio(startTime, endTime, buffer);
118
+ currentPlayingBufferSource = null;
119
+
120
+ // We don't set isPlaying = false here because we are attempting to continue playing. It will get set to false if there are no more buffers to play
121
+ playNextBuffer();
122
+ };
123
+
124
+ source.addEventListener('ended', onThisBufferPlaybackEnded);
125
+ };
126
+
127
+ const addAudioToBuffer: AddAudioToBufferFunction = (samples, sampleRate) => {
128
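+ // Wrap the raw samples in an AudioBuffer at the sender's sample rate; the AudioContext resamples as needed on playback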
+ const incomingArrayBufferChunk = audioContext.createBuffer(
129
+ // 1 channel
130
+ 1,
131
+ samples.length,
132
+ sampleRate,
133
+ );
134
+
135
+ incomingArrayBufferChunk.copyToChannel(
136
+ new Float32Array(samples),
137
+ // first channel
138
+ 0,
139
+ );
140
+
141
+ console.debug(
142
+ `[addAudioToBufferAndPlay] Adding buffer with ${incomingArrayBufferChunk.length} samples to queue.`,
143
+ );
144
+
145
+ unplayedAudioBuffers.push(incomingArrayBufferChunk);
146
+ debug()?.receivedAudio(
147
+ incomingArrayBufferChunk.length / incomingArrayBufferChunk.sampleRate,
148
+ );
149
+ const audioBuffersTableInfo = unplayedAudioBuffers.map((buffer, i) => {
150
+ return {
151
+ index: i,
152
+ duration: buffer.length / buffer.sampleRate,
153
+ samples: buffer.length,
154
+ };
155
+ });
156
+ const totalUnplayedDuration = unplayedAudioBuffers.reduce((acc, buffer) => {
157
+ return acc + buffer.length / buffer.sampleRate;
158
+ }, 0);
159
+
160
+ console.debug(
161
+ `[addAudioToBufferAndPlay] Current state of incoming audio buffers (${totalUnplayedDuration.toFixed(
162
+ 1,
163
+ )}s unplayed):`,
164
+ );
165
+ console.table(audioBuffersTableInfo);
166
+
167
+ if (shouldPlayWhenAudioAvailable) {
168
+ playNextBufferIfNotAlreadyPlaying();
169
+ }
170
+ };
171
+
172
+ return {addAudioToBuffer, setGain, stop, start};
173
+ }
streaming-react-app/src/cursorBlinkInterval.ts ADDED
@@ -0,0 +1 @@
1
+ export const CURSOR_BLINK_INTERVAL_MS = 500;
streaming-react-app/src/debug.ts ADDED
@@ -0,0 +1,257 @@
1
+ import {TYPING_ANIMATION_DELAY_MS} from './StreamingInterface';
2
+ import {getURLParams} from './URLParams';
3
+ import audioBuffertoWav from 'audiobuffer-to-wav';
4
+ import './StreamingInterface.css';
5
+
6
+ type StartEndTime = {
7
+ start: number;
8
+ end: number;
9
+ };
10
+
11
+ type StartEndTimeWithAudio = StartEndTime & {
12
+ float32Audio: Float32Array;
13
+ };
14
+
15
+ type Text = {
16
+ time: number;
17
+ chars: number;
18
+ };
19
+
20
+ type DebugTimings = {
21
+ receivedAudio: StartEndTime[];
22
+ playedAudio: StartEndTimeWithAudio[];
23
+ receivedText: Text[];
24
+ renderedText: StartEndTime[];
25
+ sentAudio: StartEndTimeWithAudio[];
26
+ startRenderTextTime: number | null;
27
+ startRecordingTime: number | null;
28
+ receivedAudioSampleRate: number | null;
29
+ };
30
+
31
+ function getInitialTimings(): DebugTimings {
32
+ return {
33
+ receivedAudio: [],
34
+ playedAudio: [],
35
+ receivedText: [],
36
+ renderedText: [],
37
+ sentAudio: [],
38
+ startRenderTextTime: null,
39
+ startRecordingTime: null,
40
+ receivedAudioSampleRate: null,
41
+ };
42
+ }
43
+
44
+ function downloadAudioBuffer(audioBuffer: AudioBuffer, fileName: string): void {
45
+ const wav = audioBuffertoWav(audioBuffer);
46
+ const wavBlob = new Blob([new DataView(wav)], {
47
+ type: 'audio/wav',
48
+ });
49
+ const url = URL.createObjectURL(wavBlob);
50
+ const anchor = document.createElement('a');
51
+ anchor.href = url;
52
+ anchor.target = '_blank';
53
+ anchor.download = fileName;
54
+ anchor.click();
55
+ }
56
+
57
+ // Uncomment for debugging without download
58
+ // function playAudioBuffer(audioBuffer: AudioBuffer): void {
59
+ // const audioContext = new AudioContext();
60
+ // const source = audioContext.createBufferSource();
61
+
62
+ // source.buffer = audioBuffer;
63
+ // source.connect(audioContext.destination);
64
+ // source.start();
65
+ // }
66
+
67
+ // Accumulate timings and audio / text translation samples for debugging and exporting
68
+ class DebugTimingsManager {
69
+ timings: DebugTimings = getInitialTimings();
70
+
71
+ start(): void {
72
+ this.timings = getInitialTimings();
73
+ this.timings.startRecordingTime = new Date().getTime();
74
+ }
75
+
76
+ sentAudio(event: AudioProcessingEvent): void {
77
+ const end = new Date().getTime();
78
+ const start = end - event.inputBuffer.duration * 1000;
79
+ // Copy or else buffer seems to be re-used
80
+ const float32Audio = new Float32Array(event.inputBuffer.getChannelData(0));
81
+ this.timings.sentAudio.push({
82
+ start,
83
+ end,
84
+ float32Audio,
85
+ });
86
+ }
87
+
88
+ receivedText(text: string): void {
89
+ this.timings.receivedText.push({
90
+ time: new Date().getTime(),
91
+ chars: text.length,
92
+ });
93
+ }
94
+
95
+ startRenderText(): void {
96
+ if (this.timings.startRenderTextTime == null) {
97
+ this.timings.startRenderTextTime = new Date().getTime();
98
+ }
99
+ }
100
+
101
+ endRenderText(): void {
102
+ if (this.timings.startRenderTextTime == null) {
103
+ console.warn(
104
+ 'Mismatched start/end text-render timings: startRenderTextTime is null',
105
+ );
106
+ return;
107
+ }
108
+
109
+ this.timings.renderedText.push({
110
+ start: this.timings.startRenderTextTime as number,
111
+ end: new Date().getTime(),
112
+ });
113
+ this.timings.startRenderTextTime = null;
114
+ }
115
+
116
+ receivedAudio(duration: number): void {
117
+ const start = new Date().getTime();
118
+ this.timings.receivedAudio.push({
119
+ start,
120
+ end: start + duration * 1000,
121
+ });
122
+ }
123
+
124
+ playedAudio(start: number, end: number, buffer: AudioBuffer | null): void {
125
+ if (buffer != null) {
126
+ if (this.timings.receivedAudioSampleRate == null) {
127
+ this.timings.receivedAudioSampleRate = buffer.sampleRate;
128
+ }
129
+ if (this.timings.receivedAudioSampleRate != buffer.sampleRate) {
130
+ console.error(
131
+ 'Sample rates of received audio are unequal, will fail to reconstruct debug audio',
132
+ this.timings.receivedAudioSampleRate,
133
+ buffer.sampleRate,
134
+ );
135
+ }
136
+ }
137
+ this.timings.playedAudio.push({
138
+ start,
139
+ end,
140
+ float32Audio:
141
+ buffer == null
142
+ ? new Float32Array()
143
+ : new Float32Array(buffer.getChannelData(0)),
144
+ });
145
+ }
146
+
147
+ getChartData() {
148
+ const columns = [
149
+ {type: 'string', id: 'Series'},
150
+ {type: 'date', id: 'Start'},
151
+ {type: 'date', id: 'End'},
152
+ ];
153
+ return [
154
+ columns,
155
+ ...this.timings.sentAudio.map((sentAudio) => [
156
+ 'Sent Audio',
157
+ new Date(sentAudio.start),
158
+ new Date(sentAudio.end),
159
+ ]),
160
+ ...this.timings.receivedAudio.map((receivedAudio) => [
161
+ 'Received Audio',
162
+ new Date(receivedAudio.start),
163
+ new Date(receivedAudio.end),
164
+ ]),
165
+ ...this.timings.playedAudio.map((playedAudio) => [
166
+ 'Played Audio',
167
+ new Date(playedAudio.start),
168
+ new Date(playedAudio.end),
169
+ ]),
170
+ // Best estimate duration by multiplying length with animation duration for each letter
171
+ ...this.timings.receivedText.map((receivedText) => [
172
+ 'Received Text',
173
+ new Date(receivedText.time),
174
+ new Date(
175
+ receivedText.time + receivedText.chars * TYPING_ANIMATION_DELAY_MS,
176
+ ),
177
+ ]),
178
+ ...this.timings.renderedText.map((renderedText) => [
179
+ 'Rendered Text',
180
+ new Date(renderedText.start),
181
+ new Date(renderedText.end),
182
+ ]),
183
+ ];
184
+ }
185
+
186
+ downloadInputAudio() {
187
+ const audioContext = new AudioContext();
188
+ const totalLength = this.timings.sentAudio.reduce((acc, cur) => {
189
+ // Parenthesize the fallback: without it, a missing float32Audio would make the whole sum NaN
+ return acc + (cur?.float32Audio?.length ?? 0);
190
+ }, 0);
191
+ if (totalLength === 0) {
192
+ return;
193
+ }
194
+
195
+ const incomingArrayBuffer = audioContext.createBuffer(
196
+ 1, // 1 channel
197
+ totalLength,
198
+ audioContext.sampleRate,
199
+ );
200
+
201
+ const buffer = incomingArrayBuffer.getChannelData(0);
202
+ let i = 0;
203
+ this.timings.sentAudio.forEach((sentAudio) => {
204
+ sentAudio.float32Audio.forEach((bytes) => {
205
+ buffer[i++] = bytes;
206
+ });
207
+ });
208
+
209
+ // Play for debugging
210
+ // playAudioBuffer(incomingArrayBuffer);
211
+ downloadAudioBuffer(incomingArrayBuffer, `input_audio.wav`);
212
+ }
213
+
214
+ downloadOutputAudio() {
215
+ const playedAudio = this.timings.playedAudio;
216
+ const sampleRate = this.timings.receivedAudioSampleRate;
217
+ if (
218
+ playedAudio.length === 0 ||
219
+ this.timings.startRecordingTime == null ||
220
+ sampleRate == null
221
+ ) {
222
+ return null;
223
+ }
224
+
225
+ let previousEndTime = this.timings.startRecordingTime;
226
+ const audioArray: number[] = [];
227
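+ // Rebuild the output timeline: pad the gap since the previous chunk with silence, then append the chunk's samples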
+ playedAudio.forEach((audio) => {
228
+ const delta = (audio.start - previousEndTime) / 1000;
229
+ for (let i = 0; i < delta * sampleRate; i++) {
230
+ audioArray.push(0.0);
231
+ }
232
+ audio.float32Audio.forEach((bytes) => audioArray.push(bytes));
233
+ previousEndTime = audio.end;
234
+ });
235
+ const audioContext = new AudioContext();
236
+ const incomingArrayBuffer = audioContext.createBuffer(
237
+ 1, // 1 channel
238
+ audioArray.length,
239
+ sampleRate,
240
+ );
241
+
242
+ incomingArrayBuffer.copyToChannel(
243
+ new Float32Array(audioArray),
244
+ 0, // first channel
245
+ );
246
+
247
+ // Play for debugging
248
+ // playAudioBuffer(incomingArrayBuffer);
249
+ downloadAudioBuffer(incomingArrayBuffer, 'output_audio.wav');
250
+ }
251
+ }
252
+
253
+ const debugSingleton = new DebugTimingsManager();
254
+ export default function debug(): DebugTimingsManager | null {
255
+ const debugParam = getURLParams().debug;
256
+ return debugParam ? debugSingleton : null;
257
+ }
streaming-react-app/src/float32To16BitPCM.ts ADDED
@@ -0,0 +1,16 @@
1
+ export default function float32To16BitPCM(
2
+ float32Arr: Float32Array,
3
+ ): Int16Array {
4
+ const pcm16bit = new Int16Array(float32Arr.length);
5
+ for (let i = 0; i < float32Arr.length; ++i) {
6
+ // force number in [-1,1]
7
+ const s = Math.max(-1, Math.min(1, float32Arr[i]));
8
+
9
+ /**
10
+ * convert 32 bit float to 16 bit int pcm audio
11
+ * 0x8000 = minimum int16 value, 0x7fff = maximum int16 value
12
+ */
13
+ pcm16bit[i] = s < 0 ? s * 0x8000 : s * 0x7fff;
14
+ }
15
+ return pcm16bit;
16
+ }
streaming-react-app/src/generateNewRoomID.ts ADDED
@@ -0,0 +1,56 @@
1
+ import {random} from 'lodash';
2
+
3
+ // const USABLE_CHARACTERS = 'BCDFGHJKMPQRTVWXY2346789';
4
+ const USABLE_CHARACTERS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';
5
+ const ID_LENGTH = 4;
6
+
7
+ export function isValidRoomID(id: string | null | undefined): boolean {
8
+ if (id == null) {
9
+ return false;
10
+ }
11
+ if (id.length !== ID_LENGTH) {
12
+ return false;
13
+ }
14
+ return isValidPartialRoomID(id);
15
+ }
16
+
17
+ export function isValidPartialRoomID(roomID: string): boolean {
18
+ return (
19
+ roomID.length <= ID_LENGTH &&
20
+ roomID.split('').every((char) => USABLE_CHARACTERS.includes(char))
21
+ );
22
+ }
23
+
24
+ export default function generateNewRoomID(): string {
25
+ return Array.from(
26
+ {length: ID_LENGTH},
27
+ () => USABLE_CHARACTERS[random(USABLE_CHARACTERS.length - 1)],
28
+ ).join('');
29
+ }
30
+
31
+ export function getSequentialRoomIDForTestingGenerator(): () => string {
32
+ let counter = 0;
33
+
34
+ return function generateNextRoomID(): string {
35
+ const counterInBase: string = Number(counter)
36
+ .toString(USABLE_CHARACTERS.length)
37
+ .padStart(ID_LENGTH, '0');
38
+
39
+ if (counterInBase.length > ID_LENGTH) {
40
+ throw new Error(
41
+ 'Ran out of unique room IDs from the sequential generator',
42
+ );
43
+ }
44
+
45
+ const result = counterInBase
46
+ .split('')
47
+ .map(
48
+ (digit) => USABLE_CHARACTERS[parseInt(digit, USABLE_CHARACTERS.length)],
49
+ )
50
+ .join('');
51
+
52
+ counter++;
53
+
54
+ return result;
55
+ };
56
+ }
streaming-react-app/src/getParamFlag.ts ADDED
@@ -0,0 +1,39 @@
1
+ import type {URLParamNames} from './types/URLParamsTypes';
2
+
3
+ export function getBooleanParamFlag(
4
+ flag: URLParamNames,
5
+ defaultValue?: boolean,
6
+ ): boolean {
7
+ const paramFlagValue = getBooleanParamFlagWithoutDefault(flag);
8
+
9
+ if (paramFlagValue == null) {
10
+ // The default value for paramFlags is false, unless they explicitly provide a
11
+ // defaultValue via the config
12
+ return defaultValue ?? false;
13
+ }
14
+
15
+ return paramFlagValue;
16
+ }
17
+
18
+ export function getBooleanParamFlagWithoutDefault(
19
+ flag: URLParamNames,
20
+ ): boolean | null {
21
+ const urlParams = new URLSearchParams(window.location.search);
22
+
23
+ if (urlParams.get(flag) == null) {
24
+ return null;
25
+ }
26
+
27
+ return urlParams.get(flag) !== '0';
28
+ }
29
+
30
+ export function getStringParamFlag(
31
+ flag: URLParamNames,
32
+ defaultValue?: string,
33
+ ): string | null {
34
+ const urlParams = new URLSearchParams(window.location.search);
35
+
36
+ const param = urlParams.get(flag);
37
+
38
+ return param ?? defaultValue ?? null;
39
+ }
streaming-react-app/src/getTranslationSentencesFromReceivedData.ts ADDED
@@ -0,0 +1,22 @@
1
+ import {ServerTextData, TranslationSentences} from './types/StreamingTypes';
2
+
3
+ export default function getTranslationSentencesFromReceivedData(
4
+ receivedData: Array<ServerTextData>,
5
+ ): TranslationSentences {
6
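+ // Append each payload to the sentence in progress; an eos flag closes it and starts a new one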
+ return receivedData
7
+ .reduce(
8
+ (acc, data) => {
9
+ const newAcc = [
10
+ ...acc.slice(0, -1),
11
+ acc[acc.length - 1].trim() + ' ' + data.payload,
12
+ ];
13
+ if (data.eos) {
14
+ newAcc.push('');
15
+ }
16
+
17
+ return newAcc;
18
+ },
19
+ [''],
20
+ )
21
+ .filter((s) => s.trim().length !== 0);
22
+ }
streaming-react-app/src/isScrolledToDocumentBottom.ts ADDED
@@ -0,0 +1,11 @@
1
+ export default function isScrolledToDocumentBottom(
2
+ bufferPx: number = 0,
3
+ ): boolean {
4
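+ // Treat the page as scrolled to the bottom when within bufferPx of the true bottom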
+ if (
5
+ window.innerHeight + window.scrollY >=
6
+ document.body.offsetHeight - bufferPx
7
+ ) {
8
+ return true;
9
+ }
10
+ return false;
11
+ }
streaming-react-app/src/languageLookup.ts ADDED
@@ -0,0 +1,119 @@
1
+ const LANG3_TO_NAME = {
2
+ afr: 'afrikaans',
3
+ amh: 'amharic',
4
+ arb: 'arabic',
5
+ asm: 'assamese',
6
+ azj: 'azerbaijani',
7
+ bak: 'bashkir',
8
+ bel: 'belarusian',
9
+ ben: 'bengali',
10
+ bod: 'tibetan',
11
+ bos: 'bosnian',
12
+ bre: 'breton',
13
+ bul: 'bulgarian',
14
+ cat: 'catalan',
15
+ ces: 'czech',
16
+ cmn: 'chinese',
17
+ cym: 'welsh',
18
+ dan: 'danish',
19
+ deu: 'german',
20
+ ell: 'greek',
21
+ eng: 'english',
22
+ est: 'estonian',
23
+ eus: 'basque',
24
+ fao: 'faroese',
25
+ fin: 'finnish',
26
+ fra: 'french',
27
+ glg: 'galician',
28
+ guj: 'gujarati',
29
+ hat: 'haitian creole',
30
+ hau: 'hausa',
31
+ haw: 'hawaiian',
32
+ heb: 'hebrew',
33
+ hin: 'hindi',
34
+ hrv: 'croatian',
35
+ hun: 'hungarian',
36
+ hye: 'armenian',
37
+ ind: 'indonesian',
38
+ isl: 'icelandic',
39
+ ita: 'italian',
40
+ jav: 'javanese',
41
+ jpn: 'japanese',
42
+ kan: 'kannada',
43
+ kat: 'georgian',
44
+ kaz: 'kazakh',
45
+ khk: 'mongolian',
46
+ khm: 'khmer',
47
+ kor: 'korean',
48
+ lao: 'lao',
49
+ lat: 'latin',
50
+ lin: 'lingala',
51
+ lit: 'lithuanian',
52
+ ltz: 'luxembourgish',
53
+ lvs: 'latvian',
54
+ mal: 'malayalam',
55
+ mar: 'marathi',
56
+ mkd: 'macedonian',
57
+ mlg: 'malagasy',
58
+ mlt: 'maltese',
59
+ mri: 'maori',
60
+ mya: 'myanmar',
61
+ nld: 'dutch',
62
+ nno: 'nynorsk',
63
+ nob: 'norwegian',
64
+ npi: 'nepali',
65
+ oci: 'occitan',
66
+ pan: 'punjabi',
67
+ pbt: 'pashto',
68
+ pes: 'persian',
69
+ pol: 'polish',
70
+ por: 'portuguese',
71
+ ron: 'romanian',
72
+ rus: 'russian',
73
+ san: 'sanskrit',
74
+ sin: 'sinhala',
75
+ slk: 'slovak',
76
+ slv: 'slovenian',
77
+ sna: 'shona',
78
+ snd: 'sindhi',
79
+ som: 'somali',
80
+ spa: 'spanish',
81
+ sqi: 'albanian',
82
+ srp: 'serbian',
83
+ sun: 'sundanese',
84
+ swe: 'swedish',
85
+ swh: 'swahili',
86
+ tam: 'tamil',
87
+ tat: 'tatar',
88
+ tel: 'telugu',
89
+ tgk: 'tajik',
90
+ tgl: 'tagalog',
91
+ tha: 'thai',
92
+ tuk: 'turkmen',
93
+ tur: 'turkish',
94
+ ukr: 'ukrainian',
95
+ urd: 'urdu',
96
+ uzn: 'uzbek',
97
+ vie: 'vietnamese',
98
+ yid: 'yiddish',
99
+ yor: 'yoruba',
100
+ zlm: 'malay',
101
+ };
102
+
103
+ export function getLanguageFromThreeLetterCode(
104
+ lang3Code: string,
105
+ ): string | null {
106
+ try {
107
+ const name = LANG3_TO_NAME[lang3Code] ?? null;
108
+ if (name == null) {
109
+ return null;
110
+ }
111
+ const capitalizedWords = name
112
+ .split(' ')
113
+ .map((word: string) => word[0].toUpperCase() + word.slice(1));
114
+ return capitalizedWords.join(' ');
115
+ } catch (e) {
116
+ console.warn(`Unable to get language name for code ${lang3Code}: ${e}`);
117
+ }
118
+ return null;
119
+ }
streaming-react-app/src/main.tsx ADDED
@@ -0,0 +1,9 @@
1
+ import React from 'react';
2
+ import ReactDOM from 'react-dom/client';
3
+ import App from './App.tsx';
4
+
5
+ ReactDOM.createRoot(document.getElementById('root')!).render(
6
+ <React.StrictMode>
7
+ <App />
8
+ </React.StrictMode>,
9
+ );
streaming-react-app/src/react-xr/ARButton.tsx ADDED
@@ -0,0 +1,89 @@
1
+ import * as THREE from 'three';
2
+ import {Button} from '@mui/material';
3
+ import {useCallback, useEffect, useState} from 'react';
4
+ import {BufferedSpeechPlayer} from '../createBufferedSpeechPlayer';
5
+
6
+ type Props = {
7
+ bufferedSpeechPlayer: BufferedSpeechPlayer;
8
+ renderer: THREE.WebGLRenderer | null;
9
+ onARVisible?: () => void;
10
+ onARHidden?: () => void;
11
+ };
12
+
13
+ export default function ARButton({
14
+ bufferedSpeechPlayer,
15
+ renderer,
16
+ onARVisible,
17
+ onARHidden,
18
+ }: Props) {
19
+ const [session, setSession] = useState<XRSession | null>(null);
20
+ const [supported, setSupported] = useState<boolean>(true);
21
+
22
+ useEffect(() => {
23
+ if (!navigator.xr) {
24
+ setSupported(false);
25
+ return;
26
+ }
27
+ navigator.xr.isSessionSupported('immersive-ar').then((supported) => {
28
+ setSupported(supported);
29
+ });
30
+ }, []);
31
+
32
+ const resetBuffers = useCallback(
33
+ (event: XRSessionEvent) => {
34
+ const session = event.target;
35
+ if (!(session instanceof XRSession)) {
36
+ return;
37
+ }
38
+ switch (session.visibilityState) {
39
+ case 'visible':
40
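+ // stop() drops any audio that queued up while the headset was hidden; start() resumes playback with fresh data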
+ console.log('Restarting speech player, device is visible');
41
+ bufferedSpeechPlayer.stop();
42
+ bufferedSpeechPlayer.start();
43
+ onARVisible?.();
44
+ break;
45
+ case 'hidden':
46
+ console.log('Stopping speech player, device is hidden');
47
+ bufferedSpeechPlayer.stop();
48
+ bufferedSpeechPlayer.start();
49
+ onARHidden?.();
50
+ break;
51
+ }
52
+ },
53
+ [bufferedSpeechPlayer],
54
+ );
55
+
56
+ async function onSessionStarted(session: XRSession) {
57
+ setSession(session);
58
+
59
+ session.onvisibilitychange = resetBuffers;
60
+ session.onend = onSessionEnded;
61
+
62
+ await renderer?.xr.setSession(session);
63
+ }
64
+
65
+ function onSessionEnded() {
66
+ setSession(null);
67
+ }
68
+
69
+ const onClick = () => {
70
+ if (session === null) {
71
+ navigator.xr!.requestSession('immersive-ar').then(onSessionStarted);
72
+ } else {
73
+ session.end();
74
+ }
75
+ };
76
+ return (
77
+ <Button
78
+ variant="contained"
79
+ onClick={onClick}
80
+ disabled={!supported || renderer == null}
81
+ sx={{mt: 1}}>
82
+ {supported
83
+ ? renderer != null
84
+ ? 'Enter AR'
85
+ : 'Initializing AR...'
86
+ : 'AR Not Supported'}
87
+ </Button>
88
+ );
89
+ }
streaming-react-app/src/react-xr/Button.tsx ADDED
@@ -0,0 +1,117 @@
1
+ import {useRef, useEffect} from 'react';
2
+ import * as THREE from 'three';
3
+ import {extend} from '@react-three/fiber';
4
+ import ThreeMeshUI from 'three-mesh-ui';
5
+ import ThreeMeshUIText, {ThreeMeshUITextType} from './ThreeMeshUIText';
6
+ import {Interactive} from '@react-three/xr';
7
+
8
+ /**
9
+ * Using `?url` at the end of this import tells vite this is a static asset, and
10
+ * provides us a URL to the hashed version of the file when the project is built.
11
+ * See: https://vitejs.dev/guide/assets.html#explicit-url-imports
12
+ */
13
+ import robotoFontFamilyJson from '../assets/RobotoMono-Regular-msdf.json?url';
14
+ import robotoFontTexture from '../assets/RobotoMono-Regular.png';
15
+
16
+ extend(ThreeMeshUI);
17
+
18
+ /**
19
+ * Button component that renders as a three-mesh-ui block
20
+ */
21
+ export default function Button({
22
+ onClick,
23
+ content,
24
+ width,
25
+ height,
26
+ fontSize,
27
+ borderRadius,
28
+ padding,
29
+ }) {
30
+ const button = useRef<JSX.IntrinsicElements['block']>();
31
+ const textRef = useRef<ThreeMeshUITextType>();
32
+
33
+ useEffect(() => {
34
+ if (textRef.current != null) {
35
+ textRef.current.set({content});
36
+ }
37
+ }, [textRef, content]);
38
+
39
+ useEffect(() => {
40
+ if (!button.current) {
41
+ return;
42
+ }
43
+ button.current.setupState({
44
+ state: 'hovered',
45
+ attributes: {
46
+ offset: 0.002,
47
+ backgroundColor: new THREE.Color(0x607b8f),
48
+ fontColor: new THREE.Color(0xffffff),
49
+ },
50
+ });
51
+ button.current.setupState({
52
+ state: 'idle',
53
+ attributes: {
54
+ offset: 0.001,
55
+ backgroundColor: new THREE.Color(0x465a69),
56
+ fontColor: new THREE.Color(0xffffff),
57
+ },
58
+ });
59
+ button.current.setupState({
60
+ state: 'selected',
61
+ attributes: {
62
+ offset: 0.005,
63
+ backgroundColor: new THREE.Color(0x000000),
64
+ fontColor: new THREE.Color(0xffffff),
65
+ },
66
+ });
67
+ button.current.setState('idle');
68
+ }, []);
69
+
70
+ const args = [
71
+ {
72
+ width,
73
+ height,
74
+ fontSize,
75
+ padding,
76
+ justifyContent: 'end',
77
+ textAlign: 'center',
78
+ alignItems: 'center',
79
+ borderRadius,
80
+ fontFamily: robotoFontFamilyJson,
81
+ fontTexture: robotoFontTexture,
82
+ backgroundOpacity: 1,
83
+ backgroundColor: new THREE.Color(0x779092),
84
+ fontColor: new THREE.Color(0x000000),
85
+ },
86
+ ];
87
+
88
+ return (
89
+ <Interactive
90
+ // These are for XR mode
91
+ onSelect={() => {
92
+ onClick();
93
+ }}
94
+ onHover={() => button.current.setState('hovered')}
95
+ onBlur={() => button.current.setState('idle')}
96
+ onSelectStart={() => button.current.setState('selected')}
97
+ onSelectEnd={() => button.current.setState('idle')}>
98
+ <block
99
+ // These are for non-XR modes
100
+ onPointerEnter={() => button.current.setState('hovered')}
101
+ onPointerLeave={() => button.current.setState('idle')}
102
+ onPointerDown={() => button.current.setState('selected')}
103
+ onPointerUp={() => {
104
+ button.current.setState('hovered');
105
+ onClick();
106
+ }}>
107
+ <block args={args} ref={button}>
108
+ <ThreeMeshUIText
109
+ ref={textRef}
110
+ fontColor={new THREE.Color(0xffffff)}
111
+ content={content}
112
+ />
113
+ </block>
114
+ </block>
115
+ </Interactive>
116
+ );
117
+ }
streaming-react-app/src/react-xr/Colors.ts ADDED
@@ -0,0 +1,6 @@
1
+ import * as THREE from 'three';
2
+
3
+ export const WHITE = new THREE.Color('#FFFFFF');
4
+ export const BLACK = new THREE.Color('#000000');
5
+ export const RED = new THREE.Color('red');
6
+ export const BLUE = new THREE.Color('blue');
streaming-react-app/src/react-xr/MovementController.tsx ADDED
@@ -0,0 +1,64 @@
1
+ import {useRef} from 'react';
2
+ import {useFrame} from '@react-three/fiber';
3
+ import {useController, useXR} from '@react-three/xr';
4
+ import * as THREE from 'three';
5
+
6
+ const USE_HORIZONTAL = true;
7
+ const USE_VERTICAL = true;
8
+ const USE_ROTATION = true;
9
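+ // With the xr-standard gamepad mapping, axes[2]/axes[3] are the thumbstick X/Y; DEADZONE filters out stick drift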
+ const HORIZONTAL_AXIS = 2;
10
+ const VERTICAL_AXIS = 3;
11
+ const ROTATION_AXIS = 2;
12
+ const SENSITIVITY = 0.05;
13
+ const DEADZONE = 0.05;
14
+
15
+ /**
16
+ * Component to add into the ThreeJS canvas that reads controller (Quest) inputs to change camera position
17
+ */
18
+ export default function MovementController() {
19
+ const xr = useXR();
20
+ const controller = useController('right');
21
+ const forward = useRef(new THREE.Vector3());
22
+ const horizontal = useRef(new THREE.Vector3());
23
+
24
+ useFrame(() => {
25
+ const player = xr.player;
26
+ const camera = xr.player.children[0];
27
+ const cameraMatrix = camera.matrixWorld.elements;
28
+ forward.current
29
+ .set(-cameraMatrix[8], -cameraMatrix[9], -cameraMatrix[10])
30
+ .normalize();
31
+
32
+ const axes = controller?.inputSource?.gamepad?.axes ?? [0, 0, 0, 0];
33
+
34
+ if (USE_HORIZONTAL) {
35
+ horizontal.current.copy(forward.current);
36
+ horizontal.current.cross(camera.up).normalize();
37
+
38
+ player.position.add(
39
+ horizontal.current.multiplyScalar(
40
+ (Math.abs(axes[HORIZONTAL_AXIS]) > DEADZONE
41
+ ? axes[HORIZONTAL_AXIS]
42
+ : 0) * SENSITIVITY,
43
+ ),
44
+ );
45
+ }
46
+
47
+ if (USE_VERTICAL) {
48
+ player.position.add(
49
+ forward.current.multiplyScalar(
50
+ (Math.abs(axes[VERTICAL_AXIS]) > DEADZONE ? axes[VERTICAL_AXIS] : 0) *
51
+ SENSITIVITY,
52
+ ),
53
+ );
54
+ }
55
+
56
+ if (USE_ROTATION) {
57
+ player.rotation.y -=
58
+ (Math.abs(axes[ROTATION_AXIS]) > DEADZONE ? axes[ROTATION_AXIS] : 0) *
59
+ SENSITIVITY;
60
+ }
61
+ });
62
+
63
+ return <></>;
64
+ }