Merge commit '2d0e88443951d8635ba0d50f005a9e9c30aaee05' as 'libs/whisper_streaming'
- libs/whisper_streaming/.gitignore +129 -0
- libs/whisper_streaming/LICENSE +21 -0
- libs/whisper_streaming/README.md +263 -0
- libs/whisper_streaming/line_packet.py +93 -0
- libs/whisper_streaming/silero_vad.py +97 -0
- libs/whisper_streaming/whisper_online.py +846 -0
- libs/whisper_streaming/whisper_online_server.py +184 -0
libs/whisper_streaming/.gitignore
ADDED
@@ -0,0 +1,129 @@
```
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/
```
libs/whisper_streaming/LICENSE
ADDED
@@ -0,0 +1,21 @@
```
MIT License

Copyright (c) 2023 ÚFAL

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
```
libs/whisper_streaming/README.md
ADDED
@@ -0,0 +1,263 @@
# whisper_streaming
Whisper realtime streaming for long speech-to-text transcription and translation

**Turning Whisper into Real-Time Transcription System**

Demonstration paper, by [Dominik Macháček](https://ufal.mff.cuni.cz/dominik-machacek), [Raj Dabre](https://prajdabre.github.io/), [Ondřej Bojar](https://ufal.mff.cuni.cz/ondrej-bojar), 2023

Abstract: Whisper is one of the recent state-of-the-art multilingual speech recognition and translation models, however, it is not designed for real-time transcription. In this paper, we build on top of Whisper and create Whisper-Streaming, an implementation of real-time speech transcription and translation of Whisper-like models. Whisper-Streaming uses local agreement policy with self-adaptive latency to enable streaming transcription. We show that Whisper-Streaming achieves high quality and 3.3 seconds latency on unsegmented long-form speech transcription test set, and we demonstrate its robustness and practical usability as a component in live transcription service at a multilingual conference.

[Paper PDF](https://aclanthology.org/2023.ijcnlp-demo.3.pdf), [Demo video](https://player.vimeo.com/video/840442741)

[Slides](http://ufallab.ms.mff.cuni.cz/~machacek/pre-prints/AACL23-2.11.2023-Turning-Whisper-oral.pdf) -- 15 minutes oral presentation at IJCNLP-AACL 2023

Please cite us. [ACL Anthology](https://aclanthology.org/2023.ijcnlp-demo.3/), [Bibtex citation](https://aclanthology.org/2023.ijcnlp-demo.3.bib):

```
@inproceedings{machacek-etal-2023-turning,
    title = "Turning Whisper into Real-Time Transcription System",
    author = "Mach{\'a}{\v{c}}ek, Dominik and
      Dabre, Raj and
      Bojar, Ond{\v{r}}ej",
    editor = "Saha, Sriparna and
      Sujaini, Herry",
    booktitle = "Proceedings of the 13th International Joint Conference on Natural Language Processing and the 3rd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics: System Demonstrations",
    month = nov,
    year = "2023",
    address = "Bali, Indonesia",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2023.ijcnlp-demo.3",
    pages = "17--24",
}
```

## Installation

1) ``pip install librosa soundfile`` -- audio processing library

2) Whisper backend.

Several alternative backends are integrated. The recommended one is [faster-whisper](https://github.com/guillaumekln/faster-whisper) with GPU support. Follow their instructions for the NVIDIA libraries -- we succeeded with CUDNN 8.5.0 and CUDA 11.7. Install with `pip install faster-whisper`.

An alternative, less restrictive, but slower backend is [whisper-timestamped](https://github.com/linto-ai/whisper-timestamped): `pip install git+https://github.com/linto-ai/whisper-timestamped`

Thirdly, it's also possible to run this software with the [OpenAI Whisper API](https://platform.openai.com/docs/api-reference/audio/createTranscription). This solution is fast and requires no GPU -- a small VM will suffice -- but you will need to pay OpenAI for API access. Also note that, since each audio fragment is processed multiple times, the [price](https://openai.com/pricing) will be higher than is obvious from the pricing page, so keep an eye on costs while using it. Setting a higher chunk size will reduce costs significantly.
Install with: `pip install openai`

For running with the openai-api backend, make sure that your [OpenAI API key](https://platform.openai.com/api-keys) is set in the `OPENAI_API_KEY` environment variable. For example, before running, do: `export OPENAI_API_KEY=sk-xxx` with *sk-xxx* replaced with your API key.

The backend is loaded only when chosen. The unused ones do not have to be installed.

3) For the voice activity controller: `pip install torch torchaudio`. Optional, but highly recommended.

<details>
<summary>4) Optional, not recommended: sentence segmenter (aka sentence tokenizer)</summary>

Two buffer trimming options are integrated and evaluated. They have an impact on
the quality and latency. The default "segment" option performs better according
to our tests and does not require any sentence segmentation installed.

The other option, "sentence" -- trimming at the end of confirmed sentences --
requires a sentence segmenter installed. It splits punctuated text into sentences by full
stops, avoiding the dots that are not full stops. The segmenters are language
specific. The unused ones do not have to be installed. We integrate the
following segmenters, but suggestions for better alternatives are welcome.

- `pip install opus-fast-mosestokenizer` for the languages with codes `as bn ca cs de el en es et fi fr ga gu hi hu is it kn lt lv ml mni mr nl or pa pl pt ro ru sk sl sv ta te yue zh`

- `pip install tokenize_uk` for Ukrainian -- `uk`

- for other languages, we integrate a well performing multi-lingual model of `wtpsplit`. It requires `pip install torch wtpsplit`, and its neural model `wtp-canine-s-12l-no-adapters` is downloaded to the default huggingface cache during the first use.

- we did not find a segmenter for the languages `as ba bo br bs fo haw hr ht jw lb ln lo mi nn oc sa sd sn so su sw tk tl tt` that are supported by Whisper and not by wtpsplit. The default fallback option for them is wtpsplit with unspecified language. Alternative suggestions are welcome.

In case of installation issues with opus-fast-mosestokenizer, especially on Windows and Mac, we recommend using only the "segment" option, which does not require it.
</details>

## Usage

### Real-time simulation from audio file

```
whisper_online.py -h
usage: whisper_online.py [-h] [--min-chunk-size MIN_CHUNK_SIZE] [--model {tiny.en,tiny,base.en,base,small.en,small,medium.en,medium,large-v1,large-v2,large-v3,large}] [--model_cache_dir MODEL_CACHE_DIR]
                         [--model_dir MODEL_DIR] [--lan LAN] [--task {transcribe,translate}] [--backend {faster-whisper,whisper_timestamped,openai-api}] [--vac] [--vac-chunk-size VAC_CHUNK_SIZE] [--vad]
                         [--buffer_trimming {sentence,segment}] [--buffer_trimming_sec BUFFER_TRIMMING_SEC] [-l {DEBUG,INFO,WARNING,ERROR,CRITICAL}] [--start_at START_AT] [--offline] [--comp_unaware]
                         audio_path

positional arguments:
  audio_path            Filename of 16kHz mono channel wav, on which live streaming is simulated.

options:
  -h, --help            show this help message and exit
  --min-chunk-size MIN_CHUNK_SIZE
                        Minimum audio chunk size in seconds. It waits up to this time to do processing. If the processing takes shorter time, it waits, otherwise it processes the whole segment that was
                        received by this time.
  --model {tiny.en,tiny,base.en,base,small.en,small,medium.en,medium,large-v1,large-v2,large-v3,large}
                        Name size of the Whisper model to use (default: large-v2). The model is automatically downloaded from the model hub if not present in model cache dir.
  --model_cache_dir MODEL_CACHE_DIR
                        Overriding the default model cache dir where models downloaded from the hub are saved
  --model_dir MODEL_DIR
                        Dir where Whisper model.bin and other files are saved. This option overrides --model and --model_cache_dir parameter.
  --lan LAN, --language LAN
                        Source language code, e.g. en,de,cs, or 'auto' for language detection.
  --task {transcribe,translate}
                        Transcribe or translate.
  --backend {faster-whisper,whisper_timestamped,openai-api}
                        Load only this backend for Whisper processing.
  --vac                 Use VAC = voice activity controller. Recommended. Requires torch.
  --vac-chunk-size VAC_CHUNK_SIZE
                        VAC sample size in seconds.
  --vad                 Use VAD = voice activity detection, with the default parameters.
  --buffer_trimming {sentence,segment}
                        Buffer trimming strategy -- trim completed sentences marked with punctuation mark and detected by sentence segmenter, or the completed segments returned by Whisper. Sentence segmenter
                        must be installed for "sentence" option.
  --buffer_trimming_sec BUFFER_TRIMMING_SEC
                        Buffer trimming length threshold in seconds. If buffer length is longer, trimming sentence/segment is triggered.
  -l {DEBUG,INFO,WARNING,ERROR,CRITICAL}, --log-level {DEBUG,INFO,WARNING,ERROR,CRITICAL}
                        Set the log level
  --start_at START_AT   Start processing audio at this time.
  --offline             Offline mode.
  --comp_unaware        Computationally unaware simulation.
```

Example:

It simulates realtime processing from a pre-recorded mono 16k wav file.

```
python3 whisper_online.py en-demo16.wav --language en --min-chunk-size 1 > out.txt
```

Simulation modes:

- default mode, no special option: real-time simulation from file, computationally aware. The chunk size is `MIN_CHUNK_SIZE` or larger, if more audio arrived during the last update computation.

- `--comp_unaware` option: computationally unaware simulation. It means that the timer that counts the emission times "stops" when the model is computing. The chunk size is always `MIN_CHUNK_SIZE`. The latency is caused only by the model being unable to confirm the output, e.g. because of language ambiguity etc., and not by slow hardware or a suboptimal implementation. We implement this feature for finding the lower bound for latency.

- `--start_at START_AT`: Start processing audio at this time. The first update receives the whole audio up to `START_AT`. It is useful for debugging, e.g. when we observe a bug at a specific time in an audio file and want to reproduce it quickly, without long waiting.

- `--offline` option: It processes the whole audio file at once, in offline mode. We implement it to find out the lowest possible WER on a given audio file.

### Output format

```
2691.4399 300 1380 Chairman, thank you.
6914.5501 1940 4940 If the debate today had a
9019.0277 5160 7160 the subject the situation in
10065.1274 7180 7480 Gaza
11058.3558 7480 9460 Strip, I might
12224.3731 9460 9760 have
13555.1929 9760 11060 joined Mrs.
14928.5479 11140 12240 De Kaiser and all the
16588.0787 12240 12560 other
18324.9285 12560 14420 colleagues across the
```

[See description here](https://github.com/ufal/whisper_streaming/blob/d915d790a62d7be4e7392dde1480e7981eb142ae/whisper_online.py#L361)
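The exact column semantics are documented at the link above. Assuming the three leading numbers are the emission time and the committed segment's start/end timestamps, all in milliseconds, a minimal parsing sketch (not part of the repo) could look like this:

```python
# Hypothetical helper for reading the simulation output redirected to out.txt.
# Assumption (per the linked description): each line is
# "<emission_ms> <beg_ms> <end_ms> <committed text>".
def parse_output_line(line: str):
    emission, beg, end, text = line.rstrip("\n").split(" ", 3)
    return float(emission), float(beg), float(end), text

with open("out.txt") as f:
    for line in f:
        if not line.strip():
            continue
        emission_ms, beg_ms, end_ms, text = parse_output_line(line)
        print(f"[{beg_ms/1000:7.2f}s - {end_ms/1000:7.2f}s] {text}")
```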
### As a module

TL;DR: use the OnlineASRProcessor object and its methods insert_audio_chunk and process_iter.

The code in whisper_online.py is well commented; read it as the full documentation.

This pseudocode describes the interface that we suggest for your implementation. You can implement any features that you need for your application.

```python
from whisper_online import *

src_lan = "en"  # source language
tgt_lan = "en"  # target language -- same as source for ASR, "en" if translate task is used

asr = FasterWhisperASR(src_lan, "large-v2")  # loads and wraps Whisper model
# set options:
# asr.set_translate_task()  # it will translate from src_lan into English
# asr.use_vad()  # set using VAD

online = OnlineASRProcessor(asr)  # create processing object with default buffer trimming option

while audio_has_not_ended:   # processing loop:
    a = # receive new audio chunk (and e.g. wait for min_chunk_size seconds first, ...)
    online.insert_audio_chunk(a)
    o = online.process_iter()
    print(o)  # do something with the current partial output
# at the end of this audio processing
o = online.finish()
print(o)  # do something with the last output


online.init()  # refresh if you're going to re-use the object for the next audio
```
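The pseudocode above can be turned into a runnable script by simulating the chunked audio feed from a wav file with `load_audio` / `load_audio_chunk` from `whisper_online.py`. A minimal sketch, assuming the faster-whisper backend is installed and a 16 kHz mono wav file is available (the file name and chunk size below are placeholders):

```python
from whisper_online import FasterWhisperASR, OnlineASRProcessor, load_audio, load_audio_chunk

SAMPLING_RATE = 16000
audio_path = "en-demo16.wav"   # placeholder: any 16 kHz mono wav
min_chunk = 1.0                # seconds of audio fed per iteration

asr = FasterWhisperASR("en", "large-v2")   # source language, model size
online = OnlineASRProcessor(asr)           # default "segment" buffer trimming

duration = len(load_audio(audio_path)) / SAMPLING_RATE
beg = 0.0
while beg < duration:
    end = min(beg + min_chunk, duration)
    online.insert_audio_chunk(load_audio_chunk(audio_path, beg, end))
    beg = end
    b, e, text = online.process_iter()     # (None, None, "") until something is committed
    if text:
        print(f"{b:.2f} {e:.2f} {text}")

b, e, text = online.finish()               # flush the remaining uncommitted text
if text:
    print(f"{b:.2f} {e:.2f} {text}")
online.init()                              # reset if the object is reused for another audio
```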
### Server -- real-time from mic

`whisper_online_server.py` has the same model options as `whisper_online.py`, plus `--host` and `--port` of the TCP connection and the `--warmup-file`. See the help message (`-h` option).

Client example:

```
arecord -f S16_LE -c1 -r 16000 -t raw -D default | nc localhost 43001
```

- arecord sends realtime audio from a sound device (e.g. mic), in raw audio format -- 16000 sampling rate, mono channel, S16_LE -- signed 16-bit integer, little-endian. (Use an alternative to arecord that works for you.)

- nc is netcat with the server's host and port
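An alternative to `arecord | nc` is a small Python client that streams a pre-recorded raw file in the same format (16 kHz, mono, signed 16-bit little-endian PCM) and paces it roughly in real time. A sketch only; the file name `audio.raw` and the 0.1 s pacing are illustrative, and reading the server's text output back from the same socket (which the helpers in `line_packet.py` are meant for) is omitted here:

```python
import socket
import time

HOST, PORT = "localhost", 43001            # where whisper_online_server.py listens
SAMPLE_RATE, BYTES_PER_SAMPLE = 16000, 2   # 16 kHz mono S16_LE, as the server expects
CHUNK_SECONDS = 0.1
CHUNK_BYTES = int(SAMPLE_RATE * BYTES_PER_SAMPLE * CHUNK_SECONDS)

with socket.create_connection((HOST, PORT)) as sock, open("audio.raw", "rb") as f:
    while True:
        chunk = f.read(CHUNK_BYTES)
        if not chunk:
            break
        sock.sendall(chunk)        # stream raw PCM bytes, like `arecord ... | nc`
        time.sleep(CHUNK_SECONDS)  # rough real-time pacing
```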
## Background

Default Whisper is intended for audio chunks of at most 30 seconds that contain one full sentence. Longer audio files must be split into shorter chunks and merged with an "init prompt". In low-latency simultaneous streaming mode, simple and naive chunking into fixed-sized windows does not work well; it can split a word in the middle. It is also necessary to know when the transcript is stable, should be confirmed ("committed") and followed up, and when the future content makes the transcript clearer.

For that, there is the LocalAgreement-n policy: if n consecutive updates, each with a newly available audio stream chunk, agree on a prefix transcript, it is confirmed. (Reference: CUNI-KIT at IWSLT 2022 etc.)
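A minimal sketch of that prefix-agreement idea (an illustration only; the project's actual implementation lives in `HypothesisBuffer` in `whisper_online.py` and works on timestamped words): keep the previous hypothesis and commit only the longest common word prefix of two consecutive ones.

```python
def local_agreement_2(prev_words, new_words):
    """Commit the longest common word prefix of two consecutive hypotheses."""
    commit = []
    for a, b in zip(prev_words, new_words):
        if a != b:
            break
        commit.append(a)
    return commit

# Example: the second update confirms only what both hypotheses agree on.
h1 = ["If", "the", "debate", "today", "had"]
h2 = ["If", "the", "debate", "today", "had", "a", "subject"]
print(local_agreement_2(h1, h2))   # ['If', 'the', 'debate', 'today', 'had']
```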
In this project, we re-use the idea of Peter Polák from this demo:
https://github.com/pe-trik/transformers/blob/online_decode/examples/pytorch/online-decoding/whisper-online-demo.py
However, it doesn't do any sentence segmentation, but Whisper produces punctuation and the libraries `faster-whisper` and `whisper-timestamped` provide word-level timestamps. In short: we consecutively process new audio chunks, emit the transcripts that are confirmed by 2 iterations, and scroll the audio processing buffer on a timestamp of a confirmed complete sentence. The processing audio buffer is not too long and the processing is fast.

In more detail: we use the init prompt, we handle inaccurate timestamps, we re-process confirmed sentence prefixes and skip them, making sure they don't overlap, and we limit the processing buffer window.

### Performance evaluation

[See the paper.](http://www.afnlp.org/conferences/ijcnlp2023/proceedings/main-demo/cdrom/pdf/2023.ijcnlp-demo.3.pdf)

### Contributions

Contributions are welcome. We acknowledge especially:

- [The GitHub contributors](https://github.com/ufal/whisper_streaming/graphs/contributors) for their pull requests with new features and bugfixes.
- [A nice explanation video](https://www.youtube.com/watch?v=_spinzpEeFM) -- published on 31st March 2024; note that newer updates are not included.
- [The translation of this repo into Chinese.](https://github.com/Gloridust/whisper_streaming_CN)
- [Ondřej Plátek](https://opla.cz/) for the paper pre-review.

Credits:

- [Peter Polák](https://ufal.mff.cuni.cz/peter-polak) for the original idea.
- The UEDIN team of the [ELITR project](https://elitr.eu) for the original line_packet.py.
- The Silero Team for their VAD [model](https://github.com/snakers4/silero-vad) and [VADIterator](https://github.com/ufal/whisper_streaming/blob/47caa80588ee9c0fa8945a5d05f0aea6315eb837/silero_vad.py#L8).

## Contact

Dominik Macháček, machacek@ufal.mff.cuni.cz
libs/whisper_streaming/line_packet.py
ADDED
@@ -0,0 +1,93 @@
```python
#!/usr/bin/env python3

"""Functions for sending and receiving individual lines of text over a socket.

A line is transmitted using one or more fixed-size packets of UTF-8 bytes
containing:

  - Zero or more bytes of UTF-8, excluding \n and \0, followed by

  - Zero or more \0 bytes as required to pad the packet to PACKET_SIZE

Originally from the UEDIN team of the ELITR project.
"""

PACKET_SIZE = 65536


def send_one_line(socket, text, pad_zeros=False):
    """Sends a line of text over the given socket.

    The 'text' argument should contain a single line of text (line break
    characters are optional). Line boundaries are determined by Python's
    str.splitlines() function [1]. We also count '\0' as a line terminator.
    If 'text' contains multiple lines then only the first will be sent.

    If the send fails then an exception will be raised.

    [1] https://docs.python.org/3.5/library/stdtypes.html#str.splitlines

    Args:
        socket: a socket object.
        text: string containing a line of text for transmission.
    """
    # treat '\0' as a line terminator, as promised in the docstring
    text = text.replace('\0', '\n')
    lines = text.splitlines()
    first_line = '' if len(lines) == 0 else lines[0]
    # TODO Is there a better way of handling bad input than 'replace'?
    data = first_line.encode('utf-8', errors='replace') + b'\n' + (b'\0' if pad_zeros else b'')
    for offset in range(0, len(data), PACKET_SIZE):
        bytes_remaining = len(data) - offset
        if bytes_remaining < PACKET_SIZE:
            padding_length = PACKET_SIZE - bytes_remaining
            packet = data[offset:] + (b'\0' * padding_length if pad_zeros else b'')
        else:
            packet = data[offset:offset+PACKET_SIZE]
        socket.sendall(packet)


def receive_one_line(socket):
    """Receives a line of text from the given socket.

    This function will (attempt to) receive a single line of text. If data is
    currently unavailable then it will block until data becomes available or
    the sender has closed the connection (in which case it will return an
    empty string).

    The string should not contain any newline characters, but if it does then
    only the first line will be returned.

    Args:
        socket: a socket object.

    Returns:
        A string representing a single line with a terminating newline or
        None if the connection has been closed.
    """
    data = b''
    while True:
        packet = socket.recv(PACKET_SIZE)
        if not packet:  # Connection has been closed.
            return None
        data += packet
        if b'\0' in packet:
            break
    # TODO Is there a better way of handling bad input than 'replace'?
    text = data.decode('utf-8', errors='replace').strip('\0')
    lines = text.split('\n')
    return lines[0] + '\n'


def receive_lines(socket):
    try:
        data = socket.recv(PACKET_SIZE)
    except BlockingIOError:
        return []
    if data is None:  # Connection has been closed.
        return None
    # TODO Is there a better way of handling bad input than 'replace'?
    text = data.decode('utf-8', errors='replace').strip('\0')
    lines = text.split('\n')
    if len(lines) == 1 and not lines[0]:
        return None
    return lines
```
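A small local sanity check of the two helpers above, assuming `line_packet.py` is importable and the platform provides `socket.socketpair()`. With `pad_zeros=True` the receiver sees the terminating `\0` padding and returns immediately:

```python
import socket

from line_packet import send_one_line, receive_one_line

a, b = socket.socketpair()                 # two connected local sockets
send_one_line(a, "Hello, streaming world!", pad_zeros=True)
print(repr(receive_one_line(b)))           # 'Hello, streaming world!\n'
a.close()
b.close()
```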
libs/whisper_streaming/silero_vad.py
ADDED
@@ -0,0 +1,97 @@
```python
import torch

# This is copied from silero-vad's vad_utils.py:
# https://github.com/snakers4/silero-vad/blob/f6b1294cb27590fb2452899df98fb234dfef1134/utils_vad.py#L340

# Their licence is MIT, same as ours: https://github.com/snakers4/silero-vad/blob/f6b1294cb27590fb2452899df98fb234dfef1134/LICENSE

class VADIterator:
    def __init__(self,
                 model,
                 threshold: float = 0.5,
                 sampling_rate: int = 16000,
                 min_silence_duration_ms: int = 100,
                 speech_pad_ms: int = 30
                 ):

        """
        Class for stream imitation

        Parameters
        ----------
        model: preloaded .jit silero VAD model

        threshold: float (default - 0.5)
            Speech threshold. Silero VAD outputs speech probabilities for each audio chunk, probabilities ABOVE this value are considered as SPEECH.
            It is better to tune this parameter for each dataset separately, but "lazy" 0.5 is pretty good for most datasets.

        sampling_rate: int (default - 16000)
            Currently silero VAD models support 8000 and 16000 sample rates

        min_silence_duration_ms: int (default - 100 milliseconds)
            In the end of each speech chunk wait for min_silence_duration_ms before separating it

        speech_pad_ms: int (default - 30 milliseconds)
            Final speech chunks are padded by speech_pad_ms each side
        """

        self.model = model
        self.threshold = threshold
        self.sampling_rate = sampling_rate

        if sampling_rate not in [8000, 16000]:
            raise ValueError('VADIterator does not support sampling rates other than [8000, 16000]')

        self.min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
        self.speech_pad_samples = sampling_rate * speech_pad_ms / 1000
        self.reset_states()

    def reset_states(self):

        self.model.reset_states()
        self.triggered = False
        self.temp_end = 0
        self.current_sample = 0

    def __call__(self, x, return_seconds=False):
        """
        x: torch.Tensor
            audio chunk (see examples in repo)

        return_seconds: bool (default - False)
            whether return timestamps in seconds (default - samples)
        """

        if not torch.is_tensor(x):
            try:
                x = torch.Tensor(x)
            except:
                raise TypeError("Audio cannot be casted to tensor. Cast it manually")

        window_size_samples = len(x[0]) if x.dim() == 2 else len(x)
        self.current_sample += window_size_samples

        speech_prob = self.model(x, self.sampling_rate).item()

        if (speech_prob >= self.threshold) and self.temp_end:
            self.temp_end = 0

        if (speech_prob >= self.threshold) and not self.triggered:
            self.triggered = True
            speech_start = self.current_sample - self.speech_pad_samples
            return {'start': int(speech_start) if not return_seconds else round(speech_start / self.sampling_rate, 1)}

        if (speech_prob < self.threshold - 0.15) and self.triggered:
            if not self.temp_end:
                self.temp_end = self.current_sample
            if self.current_sample - self.temp_end < self.min_silence_samples:
                return None
            else:
                speech_end = self.temp_end + self.speech_pad_samples
                self.temp_end = 0
                self.triggered = False
                return {'end': int(speech_end) if not return_seconds else round(speech_end / self.sampling_rate, 1)}

        return None
```
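A usage sketch that mirrors how `whisper_online.py` wires this class up (its `VACOnlineASRProcessor` loads the model from `torch.hub` and feeds small chunks). The 512-sample chunk size and the silent placeholder input are assumptions for illustration, not values taken from this repo:

```python
import numpy as np
import torch

from silero_vad import VADIterator

# Load the Silero VAD model the same way whisper_online.py does.
model, _ = torch.hub.load(repo_or_dir='snakers4/silero-vad:v4.0', model='silero_vad')
vac = VADIterator(model)                       # default options of this copy

chunk_size = 512                               # samples per VAD call (~32 ms at 16 kHz)
audio = np.zeros(512 * 50, dtype=np.float32)   # 1.6 s of silence as placeholder input

for i in range(0, len(audio), chunk_size):
    event = vac(audio[i:i + chunk_size])
    if event is not None:                      # {'start': sample} or {'end': sample}
        print(event)

vac.reset_states()                             # call between independent audio streams
```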
libs/whisper_streaming/whisper_online.py
ADDED
@@ -0,0 +1,846 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
import sys
|
3 |
+
import numpy as np
|
4 |
+
import librosa
|
5 |
+
from functools import lru_cache
|
6 |
+
import time
|
7 |
+
import logging
|
8 |
+
|
9 |
+
import io
|
10 |
+
import soundfile as sf
|
11 |
+
import math
|
12 |
+
|
13 |
+
logger = logging.getLogger(__name__)
|
14 |
+
|
15 |
+
@lru_cache
|
16 |
+
def load_audio(fname):
|
17 |
+
a, _ = librosa.load(fname, sr=16000, dtype=np.float32)
|
18 |
+
return a
|
19 |
+
|
20 |
+
def load_audio_chunk(fname, beg, end):
|
21 |
+
audio = load_audio(fname)
|
22 |
+
beg_s = int(beg*16000)
|
23 |
+
end_s = int(end*16000)
|
24 |
+
return audio[beg_s:end_s]
|
25 |
+
|
26 |
+
|
27 |
+
# Whisper backend
|
28 |
+
|
29 |
+
class ASRBase:
|
30 |
+
|
31 |
+
sep = " " # join transcribe words with this character (" " for whisper_timestamped,
|
32 |
+
# "" for faster-whisper because it emits the spaces when neeeded)
|
33 |
+
|
34 |
+
def __init__(self, lan, modelsize=None, cache_dir=None, model_dir=None, logfile=sys.stderr):
|
35 |
+
self.logfile = logfile
|
36 |
+
|
37 |
+
self.transcribe_kargs = {}
|
38 |
+
if lan == "auto":
|
39 |
+
self.original_language = None
|
40 |
+
else:
|
41 |
+
self.original_language = lan
|
42 |
+
|
43 |
+
self.model = self.load_model(modelsize, cache_dir, model_dir)
|
44 |
+
|
45 |
+
|
46 |
+
def load_model(self, modelsize, cache_dir):
|
47 |
+
raise NotImplemented("must be implemented in the child class")
|
48 |
+
|
49 |
+
def transcribe(self, audio, init_prompt=""):
|
50 |
+
raise NotImplemented("must be implemented in the child class")
|
51 |
+
|
52 |
+
def use_vad(self):
|
53 |
+
raise NotImplemented("must be implemented in the child class")
|
54 |
+
|
55 |
+
|
56 |
+
class WhisperTimestampedASR(ASRBase):
|
57 |
+
"""Uses whisper_timestamped library as the backend. Initially, we tested the code on this backend. It worked, but slower than faster-whisper.
|
58 |
+
On the other hand, the installation for GPU could be easier.
|
59 |
+
"""
|
60 |
+
|
61 |
+
sep = " "
|
62 |
+
|
63 |
+
def load_model(self, modelsize=None, cache_dir=None, model_dir=None):
|
64 |
+
import whisper
|
65 |
+
import whisper_timestamped
|
66 |
+
from whisper_timestamped import transcribe_timestamped
|
67 |
+
self.transcribe_timestamped = transcribe_timestamped
|
68 |
+
if model_dir is not None:
|
69 |
+
logger.debug("ignoring model_dir, not implemented")
|
70 |
+
return whisper.load_model(modelsize, download_root=cache_dir)
|
71 |
+
|
72 |
+
def transcribe(self, audio, init_prompt=""):
|
73 |
+
result = self.transcribe_timestamped(self.model,
|
74 |
+
audio, language=self.original_language,
|
75 |
+
initial_prompt=init_prompt, verbose=None,
|
76 |
+
condition_on_previous_text=True, **self.transcribe_kargs)
|
77 |
+
return result
|
78 |
+
|
79 |
+
def ts_words(self,r):
|
80 |
+
# return: transcribe result object to [(beg,end,"word1"), ...]
|
81 |
+
o = []
|
82 |
+
for s in r["segments"]:
|
83 |
+
for w in s["words"]:
|
84 |
+
t = (w["start"],w["end"],w["text"])
|
85 |
+
o.append(t)
|
86 |
+
return o
|
87 |
+
|
88 |
+
def segments_end_ts(self, res):
|
89 |
+
return [s["end"] for s in res["segments"]]
|
90 |
+
|
91 |
+
def use_vad(self):
|
92 |
+
self.transcribe_kargs["vad"] = True
|
93 |
+
|
94 |
+
def set_translate_task(self):
|
95 |
+
self.transcribe_kargs["task"] = "translate"
|
96 |
+
|
97 |
+
|
98 |
+
|
99 |
+
|
100 |
+
class FasterWhisperASR(ASRBase):
|
101 |
+
"""Uses faster-whisper library as the backend. Works much faster, appx 4-times (in offline mode). For GPU, it requires installation with a specific CUDNN version.
|
102 |
+
"""
|
103 |
+
|
104 |
+
sep = ""
|
105 |
+
|
106 |
+
def load_model(self, modelsize=None, cache_dir=None, model_dir=None):
|
107 |
+
from faster_whisper import WhisperModel
|
108 |
+
# logging.getLogger("faster_whisper").setLevel(logger.level)
|
109 |
+
if model_dir is not None:
|
110 |
+
logger.debug(f"Loading whisper model from model_dir {model_dir}. modelsize and cache_dir parameters are not used.")
|
111 |
+
model_size_or_path = model_dir
|
112 |
+
elif modelsize is not None:
|
113 |
+
model_size_or_path = modelsize
|
114 |
+
else:
|
115 |
+
raise ValueError("modelsize or model_dir parameter must be set")
|
116 |
+
|
117 |
+
|
118 |
+
# this worked fast and reliably on NVIDIA L40
|
119 |
+
model = WhisperModel(model_size_or_path, device="cuda", compute_type="float16", download_root=cache_dir)
|
120 |
+
|
121 |
+
# or run on GPU with INT8
|
122 |
+
# tested: the transcripts were different, probably worse than with FP16, and it was slightly (appx 20%) slower
|
123 |
+
#model = WhisperModel(model_size, device="cuda", compute_type="int8_float16")
|
124 |
+
|
125 |
+
# or run on CPU with INT8
|
126 |
+
# tested: works, but slow, appx 10-times than cuda FP16
|
127 |
+
# model = WhisperModel(modelsize, device="cpu", compute_type="int8") #, download_root="faster-disk-cache-dir/")
|
128 |
+
return model
|
129 |
+
|
130 |
+
def transcribe(self, audio, init_prompt=""):
|
131 |
+
|
132 |
+
# tested: beam_size=5 is faster and better than 1 (on one 200 second document from En ESIC, min chunk 0.01)
|
133 |
+
segments, info = self.model.transcribe(audio, language=self.original_language, initial_prompt=init_prompt, beam_size=5, word_timestamps=True, condition_on_previous_text=True, **self.transcribe_kargs)
|
134 |
+
#print(info) # info contains language detection result
|
135 |
+
|
136 |
+
return list(segments)
|
137 |
+
|
138 |
+
def ts_words(self, segments):
|
139 |
+
o = []
|
140 |
+
for segment in segments:
|
141 |
+
for word in segment.words:
|
142 |
+
if segment.no_speech_prob > 0.9:
|
143 |
+
continue
|
144 |
+
# not stripping the spaces -- should not be merged with them!
|
145 |
+
w = word.word
|
146 |
+
t = (word.start, word.end, w)
|
147 |
+
o.append(t)
|
148 |
+
return o
|
149 |
+
|
150 |
+
def segments_end_ts(self, res):
|
151 |
+
return [s.end for s in res]
|
152 |
+
|
153 |
+
def use_vad(self):
|
154 |
+
self.transcribe_kargs["vad_filter"] = True
|
155 |
+
|
156 |
+
def set_translate_task(self):
|
157 |
+
self.transcribe_kargs["task"] = "translate"
|
158 |
+
|
159 |
+
|
160 |
+
class OpenaiApiASR(ASRBase):
|
161 |
+
"""Uses OpenAI's Whisper API for audio transcription."""
|
162 |
+
|
163 |
+
def __init__(self, lan=None, temperature=0, logfile=sys.stderr):
|
164 |
+
self.logfile = logfile
|
165 |
+
|
166 |
+
self.modelname = "whisper-1"
|
167 |
+
self.original_language = None if lan == "auto" else lan # ISO-639-1 language code
|
168 |
+
self.response_format = "verbose_json"
|
169 |
+
self.temperature = temperature
|
170 |
+
|
171 |
+
self.load_model()
|
172 |
+
|
173 |
+
self.use_vad_opt = False
|
174 |
+
|
175 |
+
# reset the task in set_translate_task
|
176 |
+
self.task = "transcribe"
|
177 |
+
|
178 |
+
def load_model(self, *args, **kwargs):
|
179 |
+
from openai import OpenAI
|
180 |
+
self.client = OpenAI()
|
181 |
+
|
182 |
+
self.transcribed_seconds = 0 # for logging how many seconds were processed by API, to know the cost
|
183 |
+
|
184 |
+
|
185 |
+
def ts_words(self, segments):
|
186 |
+
no_speech_segments = []
|
187 |
+
if self.use_vad_opt:
|
188 |
+
for segment in segments.segments:
|
189 |
+
# TODO: threshold can be set from outside
|
190 |
+
if segment["no_speech_prob"] > 0.8:
|
191 |
+
no_speech_segments.append((segment.get("start"), segment.get("end")))
|
192 |
+
|
193 |
+
o = []
|
194 |
+
for word in segments.words:
|
195 |
+
start = word.get("start")
|
196 |
+
end = word.get("end")
|
197 |
+
if any(s[0] <= start <= s[1] for s in no_speech_segments):
|
198 |
+
# print("Skipping word", word.get("word"), "because it's in a no-speech segment")
|
199 |
+
continue
|
200 |
+
o.append((start, end, word.get("word")))
|
201 |
+
return o
|
202 |
+
|
203 |
+
|
204 |
+
def segments_end_ts(self, res):
|
205 |
+
return [s["end"] for s in res.words]
|
206 |
+
|
207 |
+
def transcribe(self, audio_data, prompt=None, *args, **kwargs):
|
208 |
+
# Write the audio data to a buffer
|
209 |
+
buffer = io.BytesIO()
|
210 |
+
buffer.name = "temp.wav"
|
211 |
+
sf.write(buffer, audio_data, samplerate=16000, format='WAV', subtype='PCM_16')
|
212 |
+
buffer.seek(0) # Reset buffer's position to the beginning
|
213 |
+
|
214 |
+
self.transcribed_seconds += math.ceil(len(audio_data)/16000) # it rounds up to the whole seconds
|
215 |
+
|
216 |
+
params = {
|
217 |
+
"model": self.modelname,
|
218 |
+
"file": buffer,
|
219 |
+
"response_format": self.response_format,
|
220 |
+
"temperature": self.temperature,
|
221 |
+
"timestamp_granularities": ["word", "segment"]
|
222 |
+
}
|
223 |
+
if self.task != "translate" and self.original_language:
|
224 |
+
params["language"] = self.original_language
|
225 |
+
if prompt:
|
226 |
+
params["prompt"] = prompt
|
227 |
+
|
228 |
+
if self.task == "translate":
|
229 |
+
proc = self.client.audio.translations
|
230 |
+
else:
|
231 |
+
proc = self.client.audio.transcriptions
|
232 |
+
|
233 |
+
# Process transcription/translation
|
234 |
+
transcript = proc.create(**params)
|
235 |
+
logger.debug(f"OpenAI API processed accumulated {self.transcribed_seconds} seconds")
|
236 |
+
|
237 |
+
return transcript
|
238 |
+
|
239 |
+
def use_vad(self):
|
240 |
+
self.use_vad_opt = True
|
241 |
+
|
242 |
+
def set_translate_task(self):
|
243 |
+
self.task = "translate"
|
244 |
+
|
245 |
+
|
246 |
+
|
247 |
+
|
248 |
+
class HypothesisBuffer:
|
249 |
+
|
250 |
+
def __init__(self, logfile=sys.stderr):
|
251 |
+
self.commited_in_buffer = []
|
252 |
+
self.buffer = []
|
253 |
+
self.new = []
|
254 |
+
|
255 |
+
self.last_commited_time = 0
|
256 |
+
self.last_commited_word = None
|
257 |
+
|
258 |
+
self.logfile = logfile
|
259 |
+
|
260 |
+
def insert(self, new, offset):
|
261 |
+
# compare self.commited_in_buffer and new. It inserts only the words in new that extend the commited_in_buffer, it means they are roughly behind last_commited_time and new in content
|
262 |
+
# the new tail is added to self.new
|
263 |
+
|
264 |
+
new = [(a+offset,b+offset,t) for a,b,t in new]
|
265 |
+
self.new = [(a,b,t) for a,b,t in new if a > self.last_commited_time-0.1]
|
266 |
+
|
267 |
+
if len(self.new) >= 1:
|
268 |
+
a,b,t = self.new[0]
|
269 |
+
if abs(a - self.last_commited_time) < 1:
|
270 |
+
if self.commited_in_buffer:
|
271 |
+
# it's going to search for 1, 2, ..., 5 consecutive words (n-grams) that are identical in commited and new. If they are, they're dropped.
|
272 |
+
cn = len(self.commited_in_buffer)
|
273 |
+
nn = len(self.new)
|
274 |
+
for i in range(1,min(min(cn,nn),5)+1): # 5 is the maximum
|
275 |
+
c = " ".join([self.commited_in_buffer[-j][2] for j in range(1,i+1)][::-1])
|
276 |
+
tail = " ".join(self.new[j-1][2] for j in range(1,i+1))
|
277 |
+
if c == tail:
|
278 |
+
words = []
|
279 |
+
for j in range(i):
|
280 |
+
words.append(repr(self.new.pop(0)))
|
281 |
+
words_msg = " ".join(words)
|
282 |
+
logger.debug(f"removing last {i} words: {words_msg}")
|
283 |
+
break
|
284 |
+
|
285 |
+
def flush(self):
|
286 |
+
# returns commited chunk = the longest common prefix of 2 last inserts.
|
287 |
+
|
288 |
+
commit = []
|
289 |
+
while self.new:
|
290 |
+
na, nb, nt = self.new[0]
|
291 |
+
|
292 |
+
if len(self.buffer) == 0:
|
293 |
+
break
|
294 |
+
|
295 |
+
if nt == self.buffer[0][2]:
|
296 |
+
commit.append((na,nb,nt))
|
297 |
+
self.last_commited_word = nt
|
298 |
+
self.last_commited_time = nb
|
299 |
+
self.buffer.pop(0)
|
300 |
+
self.new.pop(0)
|
301 |
+
else:
|
302 |
+
break
|
303 |
+
self.buffer = self.new
|
304 |
+
self.new = []
|
305 |
+
self.commited_in_buffer.extend(commit)
|
306 |
+
return commit
|
307 |
+
|
308 |
+
def pop_commited(self, time):
|
309 |
+
while self.commited_in_buffer and self.commited_in_buffer[0][1] <= time:
|
310 |
+
self.commited_in_buffer.pop(0)
|
311 |
+
|
312 |
+
def complete(self):
|
313 |
+
return self.buffer
|
314 |
+
|
315 |
+
class OnlineASRProcessor:
|
316 |
+
|
317 |
+
SAMPLING_RATE = 16000
|
318 |
+
|
319 |
+
def __init__(self, asr, tokenizer=None, buffer_trimming=("segment", 15), logfile=sys.stderr):
|
320 |
+
"""asr: WhisperASR object
|
321 |
+
tokenizer: sentence tokenizer object for the target language. Must have a method *split* that behaves like the one of MosesTokenizer. It can be None, if "segment" buffer trimming option is used, then tokenizer is not used at all.
|
322 |
+
("segment", 15)
|
323 |
+
buffer_trimming: a pair of (option, seconds), where option is either "sentence" or "segment", and seconds is a number. Buffer is trimmed if it is longer than "seconds" threshold. Default is the most recommended option.
|
324 |
+
logfile: where to store the log.
|
325 |
+
"""
|
326 |
+
self.asr = asr
|
327 |
+
self.tokenizer = tokenizer
|
328 |
+
self.logfile = logfile
|
329 |
+
|
330 |
+
self.init()
|
331 |
+
|
332 |
+
self.buffer_trimming_way, self.buffer_trimming_sec = buffer_trimming
|
333 |
+
|
334 |
+
def init(self, offset=None):
|
335 |
+
"""run this when starting or restarting processing"""
|
336 |
+
self.audio_buffer = np.array([],dtype=np.float32)
|
337 |
+
self.transcript_buffer = HypothesisBuffer(logfile=self.logfile)
|
338 |
+
self.buffer_time_offset = 0
|
339 |
+
if offset is not None:
|
340 |
+
self.buffer_time_offset = offset
|
341 |
+
self.transcript_buffer.last_commited_time = self.buffer_time_offset
|
342 |
+
self.commited = []
|
343 |
+
|
344 |
+
def insert_audio_chunk(self, audio):
|
345 |
+
self.audio_buffer = np.append(self.audio_buffer, audio)
|
346 |
+
|
347 |
+
def prompt(self):
|
348 |
+
"""Returns a tuple: (prompt, context), where "prompt" is a 200-character suffix of commited text that is inside of the scrolled away part of audio buffer.
|
349 |
+
"context" is the commited text that is inside the audio buffer. It is transcribed again and skipped. It is returned only for debugging and logging reasons.
|
350 |
+
"""
|
351 |
+
k = max(0,len(self.commited)-1)
|
352 |
+
while k > 0 and self.commited[k-1][1] > self.buffer_time_offset:
|
353 |
+
k -= 1
|
354 |
+
|
355 |
+
p = self.commited[:k]
|
356 |
+
p = [t for _,_,t in p]
|
357 |
+
prompt = []
|
358 |
+
l = 0
|
359 |
+
while p and l < 200: # 200 characters prompt size
|
360 |
+
x = p.pop(-1)
|
361 |
+
l += len(x)+1
|
362 |
+
prompt.append(x)
|
363 |
+
non_prompt = self.commited[k:]
|
364 |
+
return self.asr.sep.join(prompt[::-1]), self.asr.sep.join(t for _,_,t in non_prompt)
|
365 |
+
|
366 |
+
def process_iter(self):
|
367 |
+
"""Runs on the current audio buffer.
|
368 |
+
Returns: a tuple (beg_timestamp, end_timestamp, "text"), or (None, None, "").
|
369 |
+
The non-emty text is confirmed (committed) partial transcript.
|
370 |
+
"""
|
371 |
+
|
372 |
+
prompt, non_prompt = self.prompt()
|
373 |
+
logger.debug(f"PROMPT: {prompt}")
|
374 |
+
logger.debug(f"CONTEXT: {non_prompt}")
|
375 |
+
logger.debug(f"transcribing {len(self.audio_buffer)/self.SAMPLING_RATE:2.2f} seconds from {self.buffer_time_offset:2.2f}")
|
376 |
+
res = self.asr.transcribe(self.audio_buffer, init_prompt=prompt)
|
377 |
+
|
378 |
+
# transform to [(beg,end,"word1"), ...]
|
379 |
+
tsw = self.asr.ts_words(res)
|
380 |
+
|
381 |
+
self.transcript_buffer.insert(tsw, self.buffer_time_offset)
|
382 |
+
o = self.transcript_buffer.flush()
|
383 |
+
self.commited.extend(o)
|
384 |
+
completed = self.to_flush(o)
|
385 |
+
logger.debug(f">>>>COMPLETE NOW: {completed}")
|
386 |
+
the_rest = self.to_flush(self.transcript_buffer.complete())
|
387 |
+
logger.debug(f"INCOMPLETE: {the_rest}")
|
388 |
+
|
389 |
+
# there is a newly confirmed text
|
390 |
+
|
391 |
+
if o and self.buffer_trimming_way == "sentence": # trim the completed sentences
|
392 |
+
if len(self.audio_buffer)/self.SAMPLING_RATE > self.buffer_trimming_sec: # longer than this
|
393 |
+
self.chunk_completed_sentence()
|
394 |
+
|
395 |
+
|
396 |
+
if self.buffer_trimming_way == "segment":
|
397 |
+
s = self.buffer_trimming_sec # trim the completed segments longer than s,
|
398 |
+
else:
|
399 |
+
s = 30 # if the audio buffer is longer than 30s, trim it
|
400 |
+
|
401 |
+
if len(self.audio_buffer)/self.SAMPLING_RATE > s:
|
402 |
+
self.chunk_completed_segment(res)
|
403 |
+
|
404 |
+
# alternative: on any word
|
405 |
+
#l = self.buffer_time_offset + len(self.audio_buffer)/self.SAMPLING_RATE - 10
|
406 |
+
# let's find commited word that is less
|
407 |
+
#k = len(self.commited)-1
|
408 |
+
#while k>0 and self.commited[k][1] > l:
|
409 |
+
# k -= 1
|
410 |
+
#t = self.commited[k][1]
|
411 |
+
logger.debug("chunking segment")
|
412 |
+
#self.chunk_at(t)
|
413 |
+
|
414 |
+
logger.debug(f"len of buffer now: {len(self.audio_buffer)/self.SAMPLING_RATE:2.2f}")
|
415 |
+
return self.to_flush(o)
|
416 |
+
|
417 |
+
def chunk_completed_sentence(self):
|
418 |
+
if self.commited == []: return
|
419 |
+
logger.debug(self.commited)
|
420 |
+
sents = self.words_to_sentences(self.commited)
|
421 |
+
for s in sents:
|
422 |
+
logger.debug(f"\t\tSENT: {s}")
|
423 |
+
if len(sents) < 2:
|
424 |
+
return
|
425 |
+
while len(sents) > 2:
|
426 |
+
sents.pop(0)
|
427 |
+
# we will continue with audio processing at this timestamp
|
428 |
+
chunk_at = sents[-2][1]
|
429 |
+
|
430 |
+
logger.debug(f"--- sentence chunked at {chunk_at:2.2f}")
|
431 |
+
self.chunk_at(chunk_at)
|
432 |
+
|
433 |
+
def chunk_completed_segment(self, res):
|
434 |
+
if self.commited == []: return
|
435 |
+
|
436 |
+
ends = self.asr.segments_end_ts(res)
|
437 |
+
|
438 |
+
t = self.commited[-1][1]
|
439 |
+
|
440 |
+
if len(ends) > 1:
|
441 |
+
|
442 |
+
e = ends[-2]+self.buffer_time_offset
|
443 |
+
while len(ends) > 2 and e > t:
|
444 |
+
ends.pop(-1)
|
445 |
+
e = ends[-2]+self.buffer_time_offset
|
446 |
+
if e <= t:
|
447 |
+
logger.debug(f"--- segment chunked at {e:2.2f}")
|
448 |
+
self.chunk_at(e)
|
449 |
+
else:
|
450 |
+
logger.debug(f"--- last segment not within commited area")
|
451 |
+
else:
|
452 |
+
logger.debug(f"--- not enough segments to chunk")
|
453 |
+
|
454 |
+
|
455 |
+
|
456 |
+
|
457 |
+
|
458 |
+
def chunk_at(self, time):
|
459 |
+
"""trims the hypothesis and audio buffer at "time"
|
460 |
+
"""
|
461 |
+
self.transcript_buffer.pop_commited(time)
|
462 |
+
cut_seconds = time - self.buffer_time_offset
|
463 |
+
self.audio_buffer = self.audio_buffer[int(cut_seconds*self.SAMPLING_RATE):]
|
464 |
+
self.buffer_time_offset = time
|
465 |
+
|
466 |
+
def words_to_sentences(self, words):
|
467 |
+
"""Uses self.tokenizer for sentence segmentation of words.
|
468 |
+
Returns: [(beg,end,"sentence 1"),...]
|
469 |
+
"""
|
470 |
+
|
471 |
+
cwords = [w for w in words]
|
472 |
+
t = " ".join(o[2] for o in cwords)
|
473 |
+
s = self.tokenizer.split(t)
|
474 |
+
out = []
|
475 |
+
while s:
|
476 |
+
beg = None
|
477 |
+
end = None
|
478 |
+
sent = s.pop(0).strip()
|
479 |
+
fsent = sent
|
480 |
+
while cwords:
|
481 |
+
b,e,w = cwords.pop(0)
|
482 |
+
w = w.strip()
|
483 |
+
if beg is None and sent.startswith(w):
|
484 |
+
beg = b
|
485 |
+
elif end is None and sent == w:
|
486 |
+
end = e
|
487 |
+
out.append((beg,end,fsent))
|
488 |
+
break
|
489 |
+
sent = sent[len(w):].strip()
|
490 |
+
return out
|
491 |
+
|
492 |
+
def finish(self):
|
493 |
+
"""Flush the incomplete text when the whole processing ends.
|
494 |
+
Returns: the same format as self.process_iter()
|
495 |
+
"""
|
496 |
+
o = self.transcript_buffer.complete()
|
497 |
+
f = self.to_flush(o)
|
498 |
+
logger.debug(f"last, noncommited: {f}")
|
499 |
+
self.buffer_time_offset += len(self.audio_buffer)/16000
|
500 |
+
return f
|
501 |
+
|
502 |
+
|
503 |
+
def to_flush(self, sents, sep=None, offset=0, ):
|
504 |
+
# concatenates the timestamped words or sentences into one sequence that is flushed in one line
|
505 |
+
# sents: [(beg1, end1, "sentence1"), ...] or [] if empty
|
506 |
+
# return: (beg1,end-of-last-sentence,"concatenation of sentences") or (None, None, "") if empty
|
507 |
+
if sep is None:
|
508 |
+
sep = self.asr.sep
|
509 |
+
t = sep.join(s[2] for s in sents)
|
510 |
+
if len(sents) == 0:
|
511 |
+
b = None
|
512 |
+
e = None
|
513 |
+
else:
|
514 |
+
b = offset + sents[0][0]
|
515 |
+
e = offset + sents[-1][1]
|
516 |
+
return (b,e,t)
|
517 |
+
|
518 |
+
class VACOnlineASRProcessor(OnlineASRProcessor):
|
519 |
+
'''Wraps OnlineASRProcessor with VAC (Voice Activity Controller).
|
520 |
+
|
521 |
+
It works the same way as OnlineASRProcessor: it receives chunks of audio (e.g. 0.04 seconds),
|
522 |
+
it runs VAD and continuously detects whether there is speech or not.
|
523 |
+
When it detects end of speech (non-voice for 500ms), it makes OnlineASRProcessor to end the utterance immediately.
|
524 |
+
'''
|
525 |
+
|
526 |
+
def __init__(self, online_chunk_size, *a, **kw):
|
527 |
+
self.online_chunk_size = online_chunk_size
|
528 |
+
|
529 |
+
self.online = OnlineASRProcessor(*a, **kw)
|
530 |
+
|
531 |
+
# VAC:
|
532 |
+
import torch
|
533 |
+
model, _ = torch.hub.load(
|
534 |
+
repo_or_dir='snakers4/silero-vad:v4.0',
|
535 |
+
model='silero_vad'
|
536 |
+
)
|
537 |
+
from silero_vad import VADIterator
|
538 |
+
self.vac = VADIterator(model) # we use all the default options: 500ms silence, etc.
|
539 |
+
|
540 |
+
self.logfile = self.online.logfile
|
541 |
+
self.init()
|
542 |
+
|
543 |
+
def init(self):
|
544 |
+
self.online.init()
|
545 |
+
self.vac.reset_states()
|
546 |
+
self.current_online_chunk_buffer_size = 0
|
547 |
+
|
548 |
+
self.is_currently_final = False
|
549 |
+
|
550 |
+
self.status = None # or "voice" or "nonvoice"
|
551 |
+
self.audio_buffer = np.array([],dtype=np.float32)
|
552 |
+
self.buffer_offset = 0 # in frames
|
553 |
+
|
554 |
+
def clear_buffer(self):
|
555 |
+
self.buffer_offset += len(self.audio_buffer)
|
556 |
+
self.audio_buffer = np.array([],dtype=np.float32)
|
557 |
+
|
558 |
+
|
559 |
+
    def insert_audio_chunk(self, audio):
        res = self.vac(audio)
        self.audio_buffer = np.append(self.audio_buffer, audio)

        if res is not None:
            frame = list(res.values())[0]
            if 'start' in res and 'end' not in res:
                self.status = 'voice'
                send_audio = self.audio_buffer[frame-self.buffer_offset:]
                self.online.init(offset=frame/self.SAMPLING_RATE)
                self.online.insert_audio_chunk(send_audio)
                self.current_online_chunk_buffer_size += len(send_audio)
                self.clear_buffer()
            elif 'end' in res and 'start' not in res:
                self.status = 'nonvoice'
                send_audio = self.audio_buffer[:frame-self.buffer_offset]
                self.online.insert_audio_chunk(send_audio)
                self.current_online_chunk_buffer_size += len(send_audio)
                self.is_currently_final = True
                self.clear_buffer()
            else:
                # It doesn't happen in the current code.
                raise NotImplementedError("both start and end of voice in one chunk!!!")
        else:
            if self.status == 'voice':
                self.online.insert_audio_chunk(self.audio_buffer)
                self.current_online_chunk_buffer_size += len(self.audio_buffer)
                self.clear_buffer()
            else:
                # We keep 1 second because VAD may later find start of voice in it.
                # But we trim it to prevent OOM.
                self.buffer_offset += max(0, len(self.audio_buffer)-self.SAMPLING_RATE)
                self.audio_buffer = self.audio_buffer[-self.SAMPLING_RATE:]

    def process_iter(self):
        if self.is_currently_final:
            return self.finish()
        elif self.current_online_chunk_buffer_size > self.SAMPLING_RATE*self.online_chunk_size:
            self.current_online_chunk_buffer_size = 0
            ret = self.online.process_iter()
            return ret
        else:
            print("no online update, only VAD", self.status, file=self.logfile)
            return (None, None, "")

    def finish(self):
        ret = self.online.finish()
        self.current_online_chunk_buffer_size = 0
        self.is_currently_final = False
        return ret


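# Note (added for clarity): VACOnlineASRProcessor is selected by the --vac option in
# asr_factory() below; without --vac, plain OnlineASRProcessor is used.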
WHISPER_LANG_CODES = "af,am,ar,as,az,ba,be,bg,bn,bo,br,bs,ca,cs,cy,da,de,el,en,es,et,eu,fa,fi,fo,fr,gl,gu,ha,haw,he,hi,hr,ht,hu,hy,id,is,it,ja,jw,ka,kk,km,kn,ko,la,lb,ln,lo,lt,lv,mg,mi,mk,ml,mn,mr,ms,mt,my,ne,nl,nn,no,oc,pa,pl,ps,pt,ro,ru,sa,sd,si,sk,sl,sn,so,sq,sr,su,sv,sw,ta,te,tg,th,tk,tl,tr,tt,uk,ur,uz,vi,yi,yo,zh".split(",")

def create_tokenizer(lan):
    """Returns an object that has a split function working like the one of MosesTokenizer."""

    assert lan in WHISPER_LANG_CODES, "language must be Whisper's supported lang code: " + " ".join(WHISPER_LANG_CODES)

    if lan == "uk":
        import tokenize_uk
        class UkrainianTokenizer:
            def split(self, text):
                return tokenize_uk.tokenize_sents(text)
        return UkrainianTokenizer()

    # supported by fast-mosestokenizer
    if lan in "as bn ca cs de el en es et fi fr ga gu hi hu is it kn lt lv ml mni mr nl or pa pl pt ro ru sk sl sv ta te yue zh".split():
        from mosestokenizer import MosesTokenizer
        return MosesTokenizer(lan)

    # the following languages are in Whisper, but not in wtpsplit:
    if lan in "as ba bo br bs fo haw hr ht jw lb ln lo mi nn oc sa sd sn so su sw tk tl tt".split():
        logger.debug(f"{lan} code is not supported by wtpsplit. Going to use None lang_code option.")
        lan = None

    from wtpsplit import WtP
    # downloads the model from huggingface on the first use
    wtp = WtP("wtp-canine-s-12l-no-adapters")
    class WtPtok:
        def split(self, sent):
            return wtp.split(sent, lang_code=lan)
    return WtPtok()

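# Note (added for clarity): tokenize_uk, mosestokenizer and wtpsplit above are imported lazily,
# so they are only required when sentence segmentation is requested (--buffer_trimming sentence).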
def add_shared_args(parser):
    """Shared args for the simulation (this entry point) and the server.
    parser: argparse.ArgumentParser object
    """
    parser.add_argument('--min-chunk-size', type=float, default=1.0, help='Minimum audio chunk size in seconds. It waits up to this time to do processing. If the processing takes a shorter time, it waits; otherwise it processes the whole segment that was received by this time.')
    parser.add_argument('--model', type=str, default='large-v2', choices="tiny.en,tiny,base.en,base,small.en,small,medium.en,medium,large-v1,large-v2,large-v3,large".split(","), help="Name/size of the Whisper model to use (default: large-v2). The model is automatically downloaded from the model hub if not present in the model cache dir.")
    parser.add_argument('--model_cache_dir', type=str, default=None, help="Overrides the default model cache dir where models downloaded from the hub are saved.")
    parser.add_argument('--model_dir', type=str, default=None, help="Dir where the Whisper model.bin and other files are saved. This option overrides the --model and --model_cache_dir parameters.")
    parser.add_argument('--lan', '--language', type=str, default='auto', help="Source language code, e.g. en,de,cs, or 'auto' for language detection.")
    parser.add_argument('--task', type=str, default='transcribe', choices=["transcribe","translate"], help="Transcribe or translate.")
    parser.add_argument('--backend', type=str, default="faster-whisper", choices=["faster-whisper", "whisper_timestamped", "openai-api"], help='Load only this backend for Whisper processing.')
    parser.add_argument('--vac', action="store_true", default=False, help='Use VAC = voice activity controller. Recommended. Requires torch.')
    parser.add_argument('--vac-chunk-size', type=float, default=0.04, help='VAC sample size in seconds.')
    parser.add_argument('--vad', action="store_true", default=False, help='Use VAD = voice activity detection, with the default parameters.')
    parser.add_argument('--buffer_trimming', type=str, default="segment", choices=["sentence", "segment"], help='Buffer trimming strategy -- trim completed sentences marked with punctuation and detected by the sentence segmenter, or the completed segments returned by Whisper. The sentence segmenter must be installed for the "sentence" option.')
    parser.add_argument('--buffer_trimming_sec', type=float, default=15, help='Buffer trimming length threshold in seconds. If the buffer is longer, trimming of a sentence/segment is triggered.')
    parser.add_argument("-l", "--log-level", dest="log_level", choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], help="Set the log level", default='DEBUG')

def asr_factory(args, logfile=sys.stderr):
    """
    Creates and configures an ASR and an online ASR processor instance based on the specified backend and arguments.
    """
    backend = args.backend
    if backend == "openai-api":
        logger.debug("Using OpenAI API.")
        asr = OpenaiApiASR(lan=args.lan)
    else:
        if backend == "faster-whisper":
            asr_cls = FasterWhisperASR
        else:
            asr_cls = WhisperTimestampedASR

        # Only for FasterWhisperASR and WhisperTimestampedASR
        size = args.model
        t = time.time()
        logger.info(f"Loading Whisper {size} model for {args.lan}...")
        asr = asr_cls(modelsize=size, lan=args.lan, cache_dir=args.model_cache_dir, model_dir=args.model_dir)
        e = time.time()
        logger.info(f"done. It took {round(e-t,2)} seconds.")

    # Apply common configurations
    if getattr(args, 'vad', False):  # checks if the VAD argument is present and True
        logger.info("Setting VAD filter")
        asr.use_vad()

    language = args.lan
    if args.task == "translate":
        asr.set_translate_task()
        tgt_language = "en"  # Whisper translates into English
    else:
        tgt_language = language  # Whisper transcribes in this language

    # Create the tokenizer
    if args.buffer_trimming == "sentence":
        tokenizer = create_tokenizer(tgt_language)
    else:
        tokenizer = None

    # Create the OnlineASRProcessor
    if args.vac:
        online = VACOnlineASRProcessor(args.min_chunk_size, asr, tokenizer, logfile=logfile, buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))
    else:
        online = OnlineASRProcessor(asr, tokenizer, logfile=logfile, buffer_trimming=(args.buffer_trimming, args.buffer_trimming_sec))

    return asr, online

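# Typical use (a sketch of the flow implemented below and in whisper_online_server.py):
#   parser = argparse.ArgumentParser(); add_shared_args(parser); args = parser.parse_args()
#   asr, online = asr_factory(args)
#   online.insert_audio_chunk(chunk)          # feed 16 kHz float32 audio as it arrives
#   beg, end, text = online.process_iter()    # (None, None, "") when nothing is committed yet
#   beg, end, text = online.finish()          # flush the remaining uncommitted text at the end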
def set_logging(args, logger, other="_server"):
    logging.basicConfig(format='%(levelname)s\t%(message)s')
    logger.setLevel(args.log_level)
    logging.getLogger("whisper_online"+other).setLevel(args.log_level)
#    logging.getLogger("whisper_online_server").setLevel(args.log_level)


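# The __main__ block below simulates live streaming from a wav file in one of three modes:
# the default online simulation (real time, computationally aware), --comp_unaware (timing
# ignores computation), or --offline (the whole file is processed at once).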
if __name__ == "__main__":

    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('audio_path', type=str, help="Filename of a 16kHz mono wav file on which live streaming is simulated.")
    add_shared_args(parser)
    parser.add_argument('--start_at', type=float, default=0.0, help='Start processing audio at this time.')
    parser.add_argument('--offline', action="store_true", default=False, help='Offline mode.')
    parser.add_argument('--comp_unaware', action="store_true", default=False, help='Computationally unaware simulation.')

    args = parser.parse_args()

    # reset to store stderr to a different file stream, e.g. open(os.devnull,"w")
    logfile = sys.stderr

    if args.offline and args.comp_unaware:
        logger.error("Options --offline and --comp_unaware are mutually exclusive; use at most one of them. Exiting.")
        sys.exit(1)

    # if args.log_level:
    #     logging.basicConfig(format='whisper-%(levelname)s:%(name)s: %(message)s',
    #                         level=getattr(logging, args.log_level))

    set_logging(args, logger)

    audio_path = args.audio_path

    SAMPLING_RATE = 16000
    duration = len(load_audio(audio_path))/SAMPLING_RATE
    logger.info("Audio duration is: %2.2f seconds" % duration)

    asr, online = asr_factory(args, logfile=logfile)
    if args.vac:
        min_chunk = args.vac_chunk_size
    else:
        min_chunk = args.min_chunk_size

    # load the audio into the LRU cache before we start the timer
    a = load_audio_chunk(audio_path, 0, 1)

    # warm up the ASR because the very first transcribe takes much more time than the others
    asr.transcribe(a)

    beg = args.start_at
    start = time.time() - beg

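    # Note (added for clarity): 'start' is shifted back by --start_at so that the simulated
    # stream time below lines up with positions in the audio file.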
    def output_transcript(o, now=None):
        # output format in stdout is like:
        # 4186.3606 0 1720 Takhle to je
        # - the first three fields are:
        #   - emission time from the beginning of processing, in milliseconds
        #   - beg and end timestamps of the text segment, as estimated by the Whisper model.
        #     The timestamps are not accurate, but they're useful anyway.
        # - the remaining fields: the segment transcript
        if now is None:
            now = time.time() - start
        if o[0] is not None:
            print("%1.4f %1.0f %1.0f %s" % (now*1000, o[0]*1000, o[1]*1000, o[2]), file=logfile, flush=True)
            print("%1.4f %1.0f %1.0f %s" % (now*1000, o[0]*1000, o[1]*1000, o[2]), flush=True)
        else:
            # No text, so no output
            pass

    if args.offline:  ## offline mode processing (for testing/debugging)
        a = load_audio(audio_path)
        online.insert_audio_chunk(a)
        try:
            o = online.process_iter()
        except AssertionError as e:
            logger.error(f"assertion error: {repr(e)}")
        else:
            output_transcript(o)
        now = None
    elif args.comp_unaware:  # computationally unaware mode
        end = beg + min_chunk
        while True:
            a = load_audio_chunk(audio_path, beg, end)
            online.insert_audio_chunk(a)
            try:
                o = online.process_iter()
            except AssertionError as e:
                logger.error(f"assertion error: {repr(e)}")
                pass
            else:
                output_transcript(o, now=end)

            logger.debug(f"## last processed {end:.2f}s")

            if end >= duration:
                break

            beg = end

            if end + min_chunk > duration:
                end = duration
            else:
                end += min_chunk
        now = duration

    else:  # online = simultaneous mode
        end = 0
        while True:
            now = time.time() - start
            if now < end + min_chunk:
                time.sleep(min_chunk + end - now)
            end = time.time() - start
            a = load_audio_chunk(audio_path, beg, end)
            beg = end
            online.insert_audio_chunk(a)

            try:
                o = online.process_iter()
            except AssertionError as e:
                logger.error(f"assertion error: {e}")
                pass
            else:
                output_transcript(o)
            now = time.time() - start
            logger.debug(f"## last processed {end:.2f} s, now is {now:.2f}, the latency is {now-end:.2f}")

            if end >= duration:
                break
        now = None

    o = online.finish()
    output_transcript(o, now=now)

libs/whisper_streaming/whisper_online_server.py
ADDED
@@ -0,0 +1,184 @@
#!/usr/bin/env python3
from whisper_online import *

import sys
import argparse
import os
import logging
import numpy as np

logger = logging.getLogger(__name__)
parser = argparse.ArgumentParser()

# server options
parser.add_argument("--host", type=str, default='localhost')
parser.add_argument("--port", type=int, default=43007)
parser.add_argument("--warmup-file", type=str, dest="warmup_file",
        help="The path to a speech audio wav file to warm up Whisper so that the very first chunk processing is fast. It can be e.g. https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav .")

# options from whisper_online
add_shared_args(parser)
args = parser.parse_args()

set_logging(args, logger, other="")

# set up the Whisper ASR object according to the args

SAMPLING_RATE = 16000

size = args.model
language = args.lan
asr, online = asr_factory(args)
min_chunk = args.min_chunk_size

# warm up the ASR because the very first transcribe takes more time than the others.
# Test results in https://github.com/ufal/whisper_streaming/pull/81
msg = "Whisper is not warmed up. The first chunk processing may take longer."
if args.warmup_file:
    if os.path.isfile(args.warmup_file):
        a = load_audio_chunk(args.warmup_file, 0, 1)
        asr.transcribe(a)
        logger.info("Whisper is warmed up.")
    else:
        logger.critical("The warm up file is not available. " + msg)
        sys.exit(1)
else:
    logger.warning(msg)


######### Server objects

import line_packet
import socket

class Connection:
    '''Wraps the socket conn object.'''
    PACKET_SIZE = 32000*5*60  # 5 minutes  # was: 65536

    def __init__(self, conn):
        self.conn = conn
        self.last_line = ""

        self.conn.setblocking(True)

    def send(self, line):
        '''It doesn't send the same line twice, because that was problematic in online-text-flow-events.'''
        if line == self.last_line:
            return
        line_packet.send_one_line(self.conn, line)
        self.last_line = line

    def receive_lines(self):
        in_line = line_packet.receive_lines(self.conn)
        return in_line

    def non_blocking_receive_audio(self):
        try:
            r = self.conn.recv(self.PACKET_SIZE)
            return r
        except ConnectionResetError:
            return None


import io
import soundfile

# wraps the socket and ASR object, and serves one client connection.
# The next client should be served by a new instance of this object.
class ServerProcessor:

    def __init__(self, c, online_asr_proc, min_chunk):
        self.connection = c
        self.online_asr_proc = online_asr_proc
        self.min_chunk = min_chunk

        self.last_end = None

        self.is_first = True

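    # Note (added for clarity): the client is expected to stream raw, headerless PCM audio:
    # 16 kHz, mono, 16-bit signed little-endian samples, as decoded by the soundfile/librosa
    # call below.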
    def receive_audio_chunk(self):
        # receive all audio that is available by this time
        # blocks operation if less than self.min_chunk seconds is available
        # unblocks if connection is closed or a chunk is available
        out = []
        minlimit = self.min_chunk*SAMPLING_RATE
        while sum(len(x) for x in out) < minlimit:
            raw_bytes = self.connection.non_blocking_receive_audio()
            if not raw_bytes:
                break
            # print("received audio:", len(raw_bytes), "bytes", raw_bytes[:10])
            sf = soundfile.SoundFile(io.BytesIO(raw_bytes), channels=1, endian="LITTLE", samplerate=SAMPLING_RATE, subtype="PCM_16", format="RAW")
            audio, _ = librosa.load(sf, sr=SAMPLING_RATE, dtype=np.float32)
            out.append(audio)
        if not out:
            return None
        conc = np.concatenate(out)
        if self.is_first and len(conc) < minlimit:
            return None
        self.is_first = False
        return conc

    def format_output_transcript(self, o):
        # output format in stdout is like:
        # 0 1720 Takhle to je
        # - the first two fields are the beg and end timestamps of the text segment, as estimated
        #   by the Whisper model. The timestamps are not accurate, but they're useful anyway.
        # - the remaining fields: the segment transcript

        # This function differs from whisper_online.output_transcript in the following:
        # succeeding [beg,end] intervals are not overlapping, because the ELITR protocol
        # (implemented in online-text-flow events) requires it.
        # Therefore, beg is the max of the previous end and the current beg output by Whisper.
        # Usually it differs negligibly, by approx. 20 ms.

        if o[0] is not None:
            beg, end = o[0]*1000, o[1]*1000
            if self.last_end is not None:
                beg = max(beg, self.last_end)

            self.last_end = end
            print("%1.0f %1.0f %s" % (beg, end, o[2]), flush=True, file=sys.stderr)
            return "%1.0f %1.0f %s" % (beg, end, o[2])
        else:
            logger.debug("No text in this segment")
            return None

    def send_result(self, o):
        msg = self.format_output_transcript(o)
        if msg is not None:
            self.connection.send(msg)

    def process(self):
        # handle one client connection
        self.online_asr_proc.init()
        while True:
            a = self.receive_audio_chunk()
            if a is None:
                break
            self.online_asr_proc.insert_audio_chunk(a)
            o = self.online_asr_proc.process_iter()
            try:
                self.send_result(o)
            except BrokenPipeError:
                logger.info("broken pipe -- connection closed?")
                break

        # o = online.finish()  # this should be working
        # self.send_result(o)


# server loop

with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
    s.bind((args.host, args.port))
    s.listen(1)
    logger.info('Listening on ' + str((args.host, args.port)))
    while True:
        conn, addr = s.accept()
        logger.info('Connected to client on {}'.format(addr))
        connection = Connection(conn)
        proc = ServerProcessor(connection, online, args.min_chunk_size)
        proc.process()
        conn.close()
        logger.info('Connection to client closed')
logger.info('Connection closed, terminating.')
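
# A minimal client sketch (an assumption based on the audio format handled above, not part of
# this file): stream raw 16 kHz mono S16_LE PCM to the server over TCP, e.g. with netcat:
#   arecord -f S16_LE -c1 -r 16000 -t raw -D default | nc localhost 43007
# The server replies with lines of "beg_ms end_ms transcript".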