DuyTa committed on
Commit
c3b1078
1 Parent(s): 540fe46

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.
Files changed (50)
  1. whisper_pipeline/HA1.wav +3 -0
  2. whisper_pipeline/api.py +42 -0
  3. whisper_pipeline/check.py +6 -0
  4. whisper_pipeline/dockerfile +56 -0
  5. whisper_pipeline/faster-whisper-main/.github/workflows/ci.yml +90 -0
  6. whisper_pipeline/faster-whisper-main/.gitignore +15 -0
  7. whisper_pipeline/faster-whisper-main/CONTRIBUTING.md +31 -0
  8. whisper_pipeline/faster-whisper-main/LICENSE +21 -0
  9. whisper_pipeline/faster-whisper-main/MANIFEST.in +4 -0
  10. whisper_pipeline/faster-whisper-main/README.md +319 -0
  11. whisper_pipeline/faster-whisper-main/benchmark/benchmark.m4a +3 -0
  12. whisper_pipeline/faster-whisper-main/benchmark/memory_benchmark.py +94 -0
  13. whisper_pipeline/faster-whisper-main/benchmark/normalizer.json +1742 -0
  14. whisper_pipeline/faster-whisper-main/benchmark/requirements.benchmark.txt +6 -0
  15. whisper_pipeline/faster-whisper-main/benchmark/speed_benchmark.py +31 -0
  16. whisper_pipeline/faster-whisper-main/benchmark/utils.py +39 -0
  17. whisper_pipeline/faster-whisper-main/benchmark/wer_benchmark.py +64 -0
  18. whisper_pipeline/faster-whisper-main/build/lib/faster_whisper/__init__.py +14 -0
  19. whisper_pipeline/faster-whisper-main/build/lib/faster_whisper/assets/__init__.py +0 -0
  20. whisper_pipeline/faster-whisper-main/build/lib/faster_whisper/assets/pyannote_vad_model.bin +3 -0
  21. whisper_pipeline/faster-whisper-main/build/lib/faster_whisper/assets/silero_vad.onnx +3 -0
  22. whisper_pipeline/faster-whisper-main/build/lib/faster_whisper/audio.py +58 -0
  23. whisper_pipeline/faster-whisper-main/build/lib/faster_whisper/feature_extractor.py +114 -0
  24. whisper_pipeline/faster-whisper-main/build/lib/faster_whisper/tokenizer.py +314 -0
  25. whisper_pipeline/faster-whisper-main/build/lib/faster_whisper/transcribe.py +2170 -0
  26. whisper_pipeline/faster-whisper-main/build/lib/faster_whisper/utils.py +157 -0
  27. whisper_pipeline/faster-whisper-main/build/lib/faster_whisper/vad.py +596 -0
  28. whisper_pipeline/faster-whisper-main/build/lib/faster_whisper/version.py +3 -0
  29. whisper_pipeline/faster-whisper-main/docker/Dockerfile +6 -0
  30. whisper_pipeline/faster-whisper-main/docker/infer.py +7 -0
  31. whisper_pipeline/faster-whisper-main/docker/jfk.flac +3 -0
  32. whisper_pipeline/faster-whisper-main/faster_whisper.egg-info/PKG-INFO +347 -0
  33. whisper_pipeline/faster-whisper-main/faster_whisper.egg-info/SOURCES.txt +25 -0
  34. whisper_pipeline/faster-whisper-main/faster_whisper.egg-info/dependency_links.txt +1 -0
  35. whisper_pipeline/faster-whisper-main/faster_whisper.egg-info/requires.txt +17 -0
  36. whisper_pipeline/faster-whisper-main/faster_whisper.egg-info/top_level.txt +1 -0
  37. whisper_pipeline/faster-whisper-main/faster_whisper/__init__.py +14 -0
  38. whisper_pipeline/faster-whisper-main/faster_whisper/__pycache__/__init__.cpython-310.pyc +0 -0
  39. whisper_pipeline/faster-whisper-main/faster_whisper/__pycache__/audio.cpython-310.pyc +0 -0
  40. whisper_pipeline/faster-whisper-main/faster_whisper/__pycache__/feature_extractor.cpython-310.pyc +0 -0
  41. whisper_pipeline/faster-whisper-main/faster_whisper/__pycache__/tokenizer.cpython-310.pyc +0 -0
  42. whisper_pipeline/faster-whisper-main/faster_whisper/__pycache__/transcribe.cpython-310.pyc +0 -0
  43. whisper_pipeline/faster-whisper-main/faster_whisper/__pycache__/utils.cpython-310.pyc +0 -0
  44. whisper_pipeline/faster-whisper-main/faster_whisper/__pycache__/vad.cpython-310.pyc +0 -0
  45. whisper_pipeline/faster-whisper-main/faster_whisper/__pycache__/version.cpython-310.pyc +0 -0
  46. whisper_pipeline/faster-whisper-main/faster_whisper/assets/__init__.py +0 -0
  47. whisper_pipeline/faster-whisper-main/faster_whisper/assets/pyannote_vad_model.bin +3 -0
  48. whisper_pipeline/faster-whisper-main/faster_whisper/assets/silero_vad.onnx +3 -0
  49. whisper_pipeline/faster-whisper-main/faster_whisper/audio.py +58 -0
  50. whisper_pipeline/faster-whisper-main/faster_whisper/feature_extractor.py +114 -0
whisper_pipeline/HA1.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:87fd3e947f85de5aeeae4d2f34a4774370541acf92e0f3317686e3c70572aa6a
+ size 1242438
whisper_pipeline/api.py ADDED
@@ -0,0 +1,42 @@
+ from fastapi import FastAPI, UploadFile, File
+ from fastapi.responses import JSONResponse
+ from pathlib import Path
+ import os
+ from gector import GecBERTModel
+ from faster_whisper import WhisperModel, BatchedInferencePipeline
+ from transformers.models.whisper.english_normalizer import BasicTextNormalizer
+ from text_processing.inverse_normalize import InverseNormalizer
+ import shutil
+ import uvicorn
+
+ # Initialize the FastAPI app
+ app = FastAPI()
+
+ # Initialize models and normalizer
+ current_dir = Path(__file__).parent.as_posix()
+ inverse_normalizer = InverseNormalizer('vi')
+ whisper_model = WhisperModel("pho_distill_q8", device="auto", compute_type="auto")
+ batched_model = BatchedInferencePipeline(model=whisper_model, use_vad_model=True, chunk_length=15)
+ gector_model = GecBERTModel(
+     vocab_path=os.path.join(current_dir, "gector/vocabulary"),
+     model_paths=[os.path.join(current_dir, "gector/Model_GECTOR")],
+     split_chunk=True
+ )
+ normalizer = BasicTextNormalizer()
+
+ @app.post("/transcriptions")
+ async def transcribe_audio(file: UploadFile = File(...)):
+     # Save the uploaded file temporarily
+     temp_file_path = Path(f"temp_{file.filename}")
+     with open(temp_file_path, "wb") as buffer:
+         shutil.copyfileobj(file.file, buffer)
+     segments, info = batched_model.transcribe(str(temp_file_path), language="vi", batch_size=32)
+     os.remove(temp_file_path)
+     transcriptions = [segment.text for segment in segments]
+     normalized_transcriptions = [inverse_normalizer.inverse_normalize(normalizer(text)) for text in transcriptions]
+     corrected_texts = gector_model(normalized_transcriptions)
+     return JSONResponse({"text": ' '.join(corrected_texts)})
+
+
+ if __name__ == "__main__":
+     uvicorn.run("api:app", host="0.0.0.0", port=8000, reload=True)
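For reference, a minimal sketch of how the `/transcriptions` endpoint defined in `api.py` could be exercised, assuming the server has been started locally (for example with `python api.py`, which launches uvicorn on port 8000 as configured above) and using the bundled `HA1.wav` sample; the host, port, and file path are taken from this commit, everything else is an assumption:

```bash
# Post an audio file to the transcription endpoint started by api.py
# (assumes the server is listening on localhost:8000 as configured above).
curl -X POST "http://localhost:8000/transcriptions" \
     -F "file=@whisper_pipeline/HA1.wav"
# The response is JSON of the form {"text": "<corrected transcript>"}
```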
whisper_pipeline/check.py ADDED
@@ -0,0 +1,6 @@
+ from text_processing.inverse_normalize import InverseNormalizer
+ import time
+ normalizer = InverseNormalizer('vi')
+ start = time.time()
+ print(normalizer.inverse_normalize("mười hai ki lô gram"))
+ print(time.time() - start)
whisper_pipeline/dockerfile ADDED
@@ -0,0 +1,56 @@
+ # Use nvidia/cuda as base image with Python
+ FROM nvidia/cuda:12.2.2-cudnn8-runtime-ubuntu22.04
+
+ # Use args
+ ARG USE_CUDA
+ ARG USE_CUDA_VER
+
+ ## Basis ##
+ ENV ENV=prod \
+     PORT=8000 \
+     USE_CUDA_DOCKER=${USE_CUDA} \
+     USE_CUDA_DOCKER_VER=${USE_CUDA_VER}
+
+ # Install GCC and build tools
+ RUN apt-get update && \
+     apt-get install -y gcc build-essential curl git pkg-config libicu-dev && \
+     apt-get clean && \
+     rm -rf /var/lib/apt/lists/*
+
+ RUN apt-get update -y && apt-get install -y python3-pip
+
+
+
+ # Set working directory
+ WORKDIR /app
+
+ # Copy the requirements.txt file and install dependencies
+ COPY ./requirements.txt .
+
+ # Install dependencies
+ RUN pip install uv && \
+     if [ "$USE_CUDA" = "true" ]; then \
+         pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/$USE_CUDA_DOCKER_VER --no-cache-dir; \
+     else \
+         pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu --no-cache-dir; \
+     fi
+
+ # Copy faster-whisper-main folder and install
+ COPY ./faster-whisper-main ./faster-whisper-main
+ RUN pip install ./faster-whisper-main --no-cache-dir
+
+ RUN pip install --no-cache-dir -r requirements.txt
+
+
+ # Copy the remaining application code
+ COPY . .
+
+ # Expose the API port
+ EXPOSE 8000
+
+ # Set the environment variables
+ ENV HOST="0.0.0.0"
+ ENV PORT="8000"
+
+ # Set entrypoint to run the FastAPI server
+ ENTRYPOINT ["bash", "start.sh"]
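As a sketch of how this image might be built and run: the image tag and the `cu121` index suffix below are illustrative assumptions (not part of this commit), `USE_CUDA_VER` is only consulted when `USE_CUDA=true`, and `--gpus all` requires the NVIDIA Container Toolkit on the host:

```bash
# Build with CUDA-enabled PyTorch wheels; USE_CUDA_VER selects the PyTorch index suffix (e.g. cu121)
docker build --build-arg USE_CUDA=true --build-arg USE_CUDA_VER=cu121 -t whisper-pipeline .

# Run the API container and expose the FastAPI port declared in the Dockerfile
docker run --gpus all -p 8000:8000 whisper-pipeline
```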
whisper_pipeline/faster-whisper-main/.github/workflows/ci.yml ADDED
@@ -0,0 +1,90 @@
+ name: CI
+
+ on:
+   push:
+     branches:
+       - master
+     tags:
+       - v*
+   pull_request:
+     branches:
+       - master
+
+ jobs:
+   check-code-format:
+     runs-on: ubuntu-latest
+
+     steps:
+       - uses: actions/checkout@v3
+
+       - name: Set up Python 3.8
+         uses: actions/setup-python@v4
+         with:
+           python-version: 3.8
+
+       - name: Install module
+         run: |
+           pip install wheel
+           pip install -e .[dev]
+
+       - name: Check code format with Black
+         run: |
+           black --check .
+
+       - name: Check imports order with isort
+         run: |
+           isort --check-only .
+
+       - name: Check code style with Flake8
+         if: ${{ always() }}
+         run: |
+           flake8 .
+
+
+   run-tests:
+     runs-on: ubuntu-latest
+
+     steps:
+       - uses: actions/checkout@v3
+
+       - name: Set up Python 3.8
+         uses: actions/setup-python@v4
+         with:
+           python-version: 3.8
+
+       - name: Install module
+         run: |
+           pip install wheel
+           pip install -e .[dev]
+
+       - name: Run pytest
+         run: |
+           pytest -v tests/
+
+
+   build-and-push-package:
+     runs-on: ubuntu-latest
+     needs: [check-code-format, run-tests]
+
+     steps:
+       - uses: actions/checkout@v3
+
+       - name: Set up Python 3.8
+         uses: actions/setup-python@v4
+         with:
+           python-version: 3.8
+
+       - name: Install dependencies
+         run: |
+           pip install wheel
+
+       - name: Build package
+         run: |
+           python3 setup.py sdist bdist_wheel
+
+       - name: Push package on PyPI
+         if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags')
+         uses: pypa/gh-action-pypi-publish@release/v1
+         with:
+           user: __token__
+           password: ${{ secrets.PYPI_API_TOKEN }}
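The same checks can be reproduced locally before pushing; a sketch based only on the workflow steps above (run from the `faster-whisper-main` directory):

```bash
# Mirror the CI jobs locally
pip install wheel
pip install -e .[dev]
black --check .        # code format
isort --check-only .   # import order
flake8 .               # style
pytest -v tests/       # unit tests
```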
whisper_pipeline/faster-whisper-main/.gitignore ADDED
@@ -0,0 +1,15 @@
+ # Byte-compiled / Optimized / DLL Files
+ *.pyc
+ *.pyo
+ *.pyd
+ __pycache__/
+
+ # Distribution / Packaging
+ venv/
+
+ # Unit Test
+ .pytest_cache/
+
+ # Ignore IDE, Editor Files
+ .idea/
+ .vscode/
whisper_pipeline/faster-whisper-main/CONTRIBUTING.md ADDED
@@ -0,0 +1,31 @@
+ # Contributing to faster-whisper
+
+ Contributions are welcome! Here are some pointers to help you install the library for development and validate your changes before submitting a pull request.
+
+ ## Install the library for development
+
+ We recommend installing the module in editable mode with the `dev` extra requirements:
+
+ ```bash
+ git clone https://github.com/SYSTRAN/faster-whisper.git
+ cd faster-whisper/
+ pip install -e .[dev]
+ ```
+
+ ## Validate the changes before creating a pull request
+
+ 1. Make sure the existing tests are still passing (and consider adding new tests as well!):
+
+ ```bash
+ pytest tests/
+ ```
+
+ 2. Reformat and validate the code with the following tools:
+
+ ```bash
+ black .
+ isort .
+ flake8 .
+ ```
+
+ These steps are also run automatically in the CI when you open the pull request.
whisper_pipeline/faster-whisper-main/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2023 SYSTRAN
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
whisper_pipeline/faster-whisper-main/MANIFEST.in ADDED
@@ -0,0 +1,4 @@
+ include faster_whisper/assets/silero_vad.onnx
+ include requirements.txt
+ include requirements.conversion.txt
+ include faster_whisper/assets/pyannote_vad_model.bin
whisper_pipeline/faster-whisper-main/README.md ADDED
@@ -0,0 +1,319 @@
+ [![CI](https://github.com/SYSTRAN/faster-whisper/workflows/CI/badge.svg)](https://github.com/SYSTRAN/faster-whisper/actions?query=workflow%3ACI) [![PyPI version](https://badge.fury.io/py/faster-whisper.svg)](https://badge.fury.io/py/faster-whisper)
+
+ # Faster Whisper transcription with CTranslate2
+
+ **faster-whisper** is a reimplementation of OpenAI's Whisper model using [CTranslate2](https://github.com/OpenNMT/CTranslate2/), which is a fast inference engine for Transformer models.
+
+ This implementation is up to 4 times faster than [openai/whisper](https://github.com/openai/whisper) for the same accuracy while using less memory. The efficiency can be further improved with 8-bit quantization on both CPU and GPU.
+
+ ## Benchmark
+
+ ### Whisper
+
+ For reference, here's the time and memory usage that are required to transcribe [**13 minutes**](https://www.youtube.com/watch?v=0u7tTptBo9I) of audio using different implementations:
+
+ * [openai/whisper](https://github.com/openai/whisper)@[6dea21fd](https://github.com/openai/whisper/commit/6dea21fd7f7253bfe450f1e2512a0fe47ee2d258)
+ * [whisper.cpp](https://github.com/ggerganov/whisper.cpp)@[3b010f9](https://github.com/ggerganov/whisper.cpp/commit/3b010f9bed9a6068609e9faf52383aea792b0362)
+ * [faster-whisper](https://github.com/SYSTRAN/faster-whisper)@[cce6b53e](https://github.com/SYSTRAN/faster-whisper/commit/cce6b53e4554f71172dad188c45f10fb100f6e3e)
+
+ ### Large-v2 model on GPU
+
+ | Implementation | Precision | Beam size | Time | Max. GPU memory | Max. CPU memory |
+ | --- | --- | --- | --- | --- | --- |
+ | openai/whisper | fp16 | 5 | 4m30s | 11325MB | 9439MB |
+ | faster-whisper | fp16 | 5 | 54s | 4755MB | 3244MB |
+ | faster-whisper | int8 | 5 | 59s | 3091MB | 3117MB |
+
+ *Executed with CUDA 11.7.1 on an NVIDIA Tesla V100S.*
+
+ ### Small model on CPU
+
+ | Implementation | Precision | Beam size | Time | Max. memory |
+ | --- | --- | --- | --- | --- |
+ | openai/whisper | fp32 | 5 | 10m31s | 3101MB |
+ | whisper.cpp | fp32 | 5 | 17m42s | 1581MB |
+ | whisper.cpp | fp16 | 5 | 12m39s | 873MB |
+ | faster-whisper | fp32 | 5 | 2m44s | 1675MB |
+ | faster-whisper | int8 | 5 | 2m04s | 995MB |
+
+ *Executed with 8 threads on an Intel(R) Xeon(R) Gold 6226R.*
+
+
+ ### Distil-whisper
+
+ | Implementation | Precision | Beam size | Time | Gigaspeech WER |
+ | --- | --- | --- | --- | --- |
+ | distil-whisper/distil-large-v2 | fp16 | 4 | - | 10.36 |
+ | [faster-distil-large-v2](https://huggingface.co/Systran/faster-distil-whisper-large-v2) | fp16 | 5 | - | 10.28 |
+ | distil-whisper/distil-medium.en | fp16 | 4 | - | 11.21 |
+ | [faster-distil-medium.en](https://huggingface.co/Systran/faster-distil-whisper-medium.en) | fp16 | 5 | - | 11.21 |
+
+ *Executed with CUDA 11.4 on an NVIDIA 3090.*
+
+ <details>
+ <summary>testing details (click to expand)</summary>
+
+ For `distil-whisper/distil-large-v2`, the WER is tested with the code sample from [link](https://huggingface.co/distil-whisper/distil-large-v2#evaluation). For `faster-distil-whisper`, the WER is tested with the following setting:
+ ```python
+ from faster_whisper import WhisperModel
+
+ model_size = "distil-large-v2"
+ # model_size = "distil-medium.en"
+ # Run on GPU with FP16
+ model = WhisperModel(model_size, device="cuda", compute_type="float16")
+ segments, info = model.transcribe("audio.mp3", beam_size=5, language="en")
+ ```
+ </details>
+
+ ## Requirements
+
+ * Python 3.8 or greater
+
+
+ ### GPU
+
+ GPU execution requires the following NVIDIA libraries to be installed:
+
+ * [cuBLAS for CUDA 12](https://developer.nvidia.com/cublas)
+ * [cuDNN 8 for CUDA 12](https://developer.nvidia.com/cudnn)
+
+ **Note**: Latest versions of `ctranslate2` support CUDA 12 only. For CUDA 11, the current workaround is downgrading to the `3.24.0` version of `ctranslate2` (This can be done with `pip install --force-reinstall ctranslate2==3.24.0` or specifying the version in a `requirements.txt`).
+
+ There are multiple ways to install the NVIDIA libraries mentioned above. The recommended way is described in the official NVIDIA documentation, but we also suggest other installation methods below.
+
+ <details>
+ <summary>Other installation methods (click to expand)</summary>
+
+
+ **Note:** For all these methods below, keep in mind the above note regarding CUDA versions. Depending on your setup, you may need to install the _CUDA 11_ versions of libraries that correspond to the CUDA 12 libraries listed in the instructions below.
+
+ #### Use Docker
+
+ The libraries (cuBLAS, cuDNN) are installed in these official NVIDIA CUDA Docker images: `nvidia/cuda:12.0.0-runtime-ubuntu20.04` or `nvidia/cuda:12.0.0-runtime-ubuntu22.04`.
+
+ #### Install with `pip` (Linux only)
+
+ On Linux these libraries can be installed with `pip`. Note that `LD_LIBRARY_PATH` must be set before launching Python.
+
+ ```bash
+ pip install nvidia-cublas-cu12 nvidia-cudnn-cu12
+
+ export LD_LIBRARY_PATH=`python3 -c 'import os; import nvidia.cublas.lib; import nvidia.cudnn.lib; print(os.path.dirname(nvidia.cublas.lib.__file__) + ":" + os.path.dirname(nvidia.cudnn.lib.__file__))'`
+ ```
+
+ **Note**: Version 9+ of `nvidia-cudnn-cu12` appears to cause issues due to its reliance on cuDNN 9 (Faster-Whisper does not currently support cuDNN 9). Ensure your version of the Python package is for cuDNN 8.
+
+ #### Download the libraries from Purfview's repository (Windows & Linux)
+
+ Purfview's [whisper-standalone-win](https://github.com/Purfview/whisper-standalone-win) provides the required NVIDIA libraries for Windows & Linux in a [single archive](https://github.com/Purfview/whisper-standalone-win/releases/tag/libs). Decompress the archive and place the libraries in a directory included in the `PATH`.
+
+ </details>
+
+ ## Installation
+
+ The module can be installed from [PyPI](https://pypi.org/project/faster-whisper/):
+
+ ```bash
+ pip install faster-whisper
+ ```
+
+ <details>
+ <summary>Other installation methods (click to expand)</summary>
+
+ ### Install the master branch
+
+ ```bash
+ pip install --force-reinstall "faster-whisper @ https://github.com/SYSTRAN/faster-whisper/archive/refs/heads/master.tar.gz"
+ ```
+
+ ### Install a specific commit
+
+ ```bash
+ pip install --force-reinstall "faster-whisper @ https://github.com/SYSTRAN/faster-whisper/archive/a4f1cc8f11433e454c3934442b5e1a4ed5e865c3.tar.gz"
+ ```
+
+ </details>
+
+ ## Usage
+
+ ### Faster-whisper
+
+ ```python
+ from faster_whisper import WhisperModel
+
+ model_size = "large-v3"
+
+ # Run on GPU with FP16
+ model = WhisperModel(model_size, device="cuda", compute_type="float16")
+
+ # or run on GPU with INT8
+ # model = WhisperModel(model_size, device="cuda", compute_type="int8_float16")
+ # or run on CPU with INT8
+ # model = WhisperModel(model_size, device="cpu", compute_type="int8")
+
+ segments, info = model.transcribe("audio.mp3", beam_size=5)
+
+ print("Detected language '%s' with probability %f" % (info.language, info.language_probability))
+
+ for segment in segments:
+     print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
+ ```
+
+ **Warning:** `segments` is a *generator* so the transcription only starts when you iterate over it. The transcription can be run to completion by gathering the segments in a list or a `for` loop:
+
+ ```python
+ segments, _ = model.transcribe("audio.mp3")
+ segments = list(segments)  # The transcription will actually run here.
+ ```
+
+ ### Multi-segment language detection
+
+ To directly use the model for improved language detection, the following code snippet can be used:
+
+ ```python
+ from faster_whisper import WhisperModel
+ model = WhisperModel("medium", device="cuda", compute_type="float16")
+ language_info = model.detect_language_multi_segment("audio.mp3")
+ ```
+
+ ### Batched faster-whisper
+
+
+ The batched version of faster-whisper is inspired by [whisper-x](https://github.com/m-bain/whisperX), licensed under the BSD-2-Clause license, and integrates its VAD model into this library. We modified this implementation and also replaced the feature extraction with a faster torch-based implementation. The batched version improves the speed by up to 10-12x compared to the openai/whisper implementation and 3-4x compared to the sequential faster-whisper version. It works by transcribing semantically meaningful audio chunks as batches, leading to faster inference.
+
+ The following code snippet illustrates how to run inference with the batched version on an example audio file. Please also refer to the test scripts of batched faster-whisper.
+
+ ```python
+ from faster_whisper import WhisperModel, BatchedInferencePipeline
+
+ model = WhisperModel("medium", device="cuda", compute_type="float16")
+ batched_model = BatchedInferencePipeline(model=model)
+ segments, info = batched_model.transcribe("audio.mp3", batch_size=16)
+
+ for segment in segments:
+     print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
+ ```
+
+ ### Faster Distil-Whisper
+
+ The Distil-Whisper checkpoints are compatible with the Faster-Whisper package. In particular, the latest [distil-large-v3](https://huggingface.co/distil-whisper/distil-large-v3)
+ checkpoint is intrinsically designed to work with the Faster-Whisper transcription algorithm. The following code snippet
+ demonstrates how to run inference with distil-large-v3 on a specified audio file:
+
+ ```python
+ from faster_whisper import WhisperModel
+
+ model_size = "distil-large-v3"
+
+ model = WhisperModel(model_size, device="cuda", compute_type="float16")
+ segments, info = model.transcribe("audio.mp3", beam_size=5, language="en", condition_on_previous_text=False)
+
+ for segment in segments:
+     print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
+ ```
+
+ For more information about the distil-large-v3 model, refer to the original [model card](https://huggingface.co/distil-whisper/distil-large-v3).
+
+ ### Word-level timestamps
+
+ ```python
+ segments, _ = model.transcribe("audio.mp3", word_timestamps=True)
+
+ for segment in segments:
+     for word in segment.words:
+         print("[%.2fs -> %.2fs] %s" % (word.start, word.end, word.word))
+ ```
+
+ ### VAD filter
+
+ The library integrates the [Silero VAD](https://github.com/snakers4/silero-vad) model to filter out parts of the audio without speech:
+
+ ```python
+ segments, _ = model.transcribe("audio.mp3", vad_filter=True)
+ ```
+
+ The default behavior is conservative and only removes silence longer than 2 seconds. See the available VAD parameters and default values in the [source code](https://github.com/SYSTRAN/faster-whisper/blob/master/faster_whisper/vad.py). They can be customized with the dictionary argument `vad_parameters`:
+
+ ```python
+ segments, _ = model.transcribe(
+     "audio.mp3",
+     vad_filter=True,
+     vad_parameters=dict(min_silence_duration_ms=500),
+ )
+ ```
+
+ ### Logging
+
+ The library logging level can be configured like this:
+
+ ```python
+ import logging
+
+ logging.basicConfig()
+ logging.getLogger("faster_whisper").setLevel(logging.DEBUG)
+ ```
+
+ ### Going further
+
+ See more model and transcription options in the [`WhisperModel`](https://github.com/SYSTRAN/faster-whisper/blob/master/faster_whisper/transcribe.py) class implementation.
+
+ ## Community integrations
+
+ Here is a non-exhaustive list of open-source projects using faster-whisper. Feel free to add your project to the list!
+
+
+ * [faster-whisper-server](https://github.com/fedirz/faster-whisper-server) is an OpenAI-compatible server using `faster-whisper`. It's easily deployable with Docker, works with OpenAI SDKs/CLI, and supports streaming and live transcription.
+ * [WhisperX](https://github.com/m-bain/whisperX) is an award-winning Python library that offers speaker diarization and accurate word-level timestamps using wav2vec2 alignment.
+ * [whisper-ctranslate2](https://github.com/Softcatala/whisper-ctranslate2) is a command line client based on faster-whisper and compatible with the original client from openai/whisper.
+ * [whisper-diarize](https://github.com/MahmoudAshraf97/whisper-diarization) is a speaker diarization tool that is based on faster-whisper and NVIDIA NeMo.
+ * [whisper-standalone-win](https://github.com/Purfview/whisper-standalone-win) provides standalone CLI executables of faster-whisper for Windows, Linux & macOS.
+ * [asr-sd-pipeline](https://github.com/hedrergudene/asr-sd-pipeline) provides a scalable, modular, end-to-end multi-speaker speech-to-text solution implemented using AzureML pipelines.
+ * [Open-Lyrics](https://github.com/zh-plus/Open-Lyrics) is a Python library that transcribes voice files using faster-whisper and translates/polishes the resulting text into `.lrc` files in the desired language using OpenAI-GPT.
+ * [wscribe](https://github.com/geekodour/wscribe) is a flexible transcript generation tool supporting faster-whisper; it can export word-level transcripts, which can then be edited with [wscribe-editor](https://github.com/geekodour/wscribe-editor).
+ * [aTrain](https://github.com/BANDAS-Center/aTrain) is a graphical user interface implementation of faster-whisper developed at the BANDAS-Center at the University of Graz for transcription and diarization in Windows ([Windows Store App](https://apps.microsoft.com/detail/atrain/9N15Q44SZNS2)) and Linux.
+ * [Whisper-Streaming](https://github.com/ufal/whisper_streaming) implements real-time mode for offline Whisper-like speech-to-text models with faster-whisper as the most recommended back-end. It implements a streaming policy with self-adaptive latency based on the actual source complexity, and demonstrates the state of the art.
+ * [WhisperLive](https://github.com/collabora/WhisperLive) is a nearly-live implementation of OpenAI's Whisper which uses faster-whisper as the backend to transcribe audio in real-time.
+ * [Faster-Whisper-Transcriber](https://github.com/BBC-Esq/ctranslate2-faster-whisper-transcriber) is a simple but reliable voice transcriber that provides a user-friendly interface.
+
+ ## Model conversion
+
+ When loading a model from its size such as `WhisperModel("large-v3")`, the corresponding CTranslate2 model is automatically downloaded from the [Hugging Face Hub](https://huggingface.co/Systran).
+
+ We also provide a script to convert any Whisper models compatible with the Transformers library. They could be the original OpenAI models or user fine-tuned models.
+
+ For example the command below converts the [original "large-v3" Whisper model](https://huggingface.co/openai/whisper-large-v3) and saves the weights in FP16:
+
+ ```bash
+ pip install transformers[torch]>=4.23
+
+ ct2-transformers-converter --model openai/whisper-large-v3 --output_dir whisper-large-v3-ct2 \
+     --copy_files tokenizer.json preprocessor_config.json --quantization float16
+ ```
+
+ * The option `--model` accepts a model name on the Hub or a path to a model directory.
+ * If the option `--copy_files tokenizer.json` is not used, the tokenizer configuration is automatically downloaded when the model is loaded later.
+
+ Models can also be converted from the code. See the [conversion API](https://opennmt.net/CTranslate2/python/ctranslate2.converters.TransformersConverter.html).
+
+ ### Load a converted model
+
+ 1. Directly load the model from a local directory:
+ ```python
+ model = faster_whisper.WhisperModel("whisper-large-v3-ct2")
+ ```
+
+ 2. [Upload your model to the Hugging Face Hub](https://huggingface.co/docs/transformers/model_sharing#upload-with-the-web-interface) and load it from its name:
+ ```python
+ model = faster_whisper.WhisperModel("username/whisper-large-v3-ct2")
+ ```
+
+ ## Comparing performance against other implementations
+
+ If you are comparing the performance against other Whisper implementations, you should make sure to run the comparison with similar settings. In particular:
+
+ * Verify that the same transcription options are used, especially the same beam size. For example, in openai/whisper, `model.transcribe` uses a default beam size of 1, but here we use a default beam size of 5.
+ * When running on CPU, make sure to set the same number of threads. Many frameworks will read the environment variable `OMP_NUM_THREADS`, which can be set when running your script:
+
+ ```bash
+ OMP_NUM_THREADS=4 python3 my_script.py
+ ```
whisper_pipeline/faster-whisper-main/benchmark/benchmark.m4a ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5dedec4f587a7940cfab93ff36e5014f155f80e10b7935f67d9eee8761663c34
+ size 12935433
whisper_pipeline/faster-whisper-main/benchmark/memory_benchmark.py ADDED
@@ -0,0 +1,94 @@
+ import argparse
+ import time
+
+ from typing import Callable
+
+ import py3nvml.py3nvml as nvml
+
+ from memory_profiler import memory_usage
+ from utils import MyThread, get_logger, inference
+
+ logger = get_logger("faster-whisper")
+ parser = argparse.ArgumentParser(description="Memory benchmark")
+ parser.add_argument(
+     "--gpu_memory", action="store_true", help="Measure GPU memory usage"
+ )
+ parser.add_argument("--device-index", type=int, default=0, help="GPU device index")
+ parser.add_argument(
+     "--interval",
+     type=float,
+     default=0.5,
+     help="Interval at which measurements are collected",
+ )
+ args = parser.parse_args()
+ device_idx = args.device_index
+ interval = args.interval
+
+
+ def measure_memory(func: Callable[[], None]):
+     if args.gpu_memory:
+         logger.info(
+             "Measuring maximum GPU memory usage on GPU device."
+             " Make sure to not have additional processes running on the same GPU."
+         )
+         # init nvml
+         nvml.nvmlInit()
+         handle = nvml.nvmlDeviceGetHandleByIndex(device_idx)
+         gpu_name = nvml.nvmlDeviceGetName(handle)
+         gpu_memory_limit = nvml.nvmlDeviceGetMemoryInfo(handle).total >> 20
+         gpu_power_limit = nvml.nvmlDeviceGetPowerManagementLimit(handle) / 1000.0
+         info = {"gpu_memory_usage": [], "gpu_power_usage": []}
+
+         def _get_gpu_info():
+             while True:
+                 info["gpu_memory_usage"].append(
+                     nvml.nvmlDeviceGetMemoryInfo(handle).used >> 20
+                 )
+                 info["gpu_power_usage"].append(
+                     nvml.nvmlDeviceGetPowerUsage(handle) / 1000
+                 )
+                 time.sleep(interval)
+
+                 if stop:
+                     break
+
+             return info
+
+         stop = False
+         thread = MyThread(_get_gpu_info, params=())
+         thread.start()
+         func()
+         stop = True
+         thread.join()
+         result = thread.get_result()
+
+         # shutdown nvml
+         nvml.nvmlShutdown()
+         max_memory_usage = max(result["gpu_memory_usage"])
+         max_power_usage = max(result["gpu_power_usage"])
+         print("GPU name: %s" % gpu_name)
+         print("GPU device index: %s" % device_idx)
+         print(
+             "Maximum GPU memory usage: %dMiB / %dMiB (%.2f%%)"
+             % (
+                 max_memory_usage,
+                 gpu_memory_limit,
+                 (max_memory_usage / gpu_memory_limit) * 100,
+             )
+         )
+         print(
+             "Maximum GPU power usage: %dW / %dW (%.2f%%)"
+             % (
+                 max_power_usage,
+                 gpu_power_limit,
+                 (max_power_usage / gpu_power_limit) * 100,
+             )
+         )
+     else:
+         logger.info("Measuring maximum increase of memory usage.")
+         max_usage = memory_usage(func, max_usage=True, interval=interval)
+         print("Maximum increase of RAM memory usage: %d MiB" % max_usage)
+
+
+ if __name__ == "__main__":
+     measure_memory(inference)
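A sketch of how the benchmark script above might be invoked, assuming the packages listed in `requirements.benchmark.txt` are installed and that the companion `utils.py` provides the `inference` routine it imports; the flag values are only examples of the argparse defaults defined above:

```bash
# Track peak GPU memory and power draw while the benchmark inference runs
python3 memory_benchmark.py --gpu_memory --device-index 0 --interval 0.5

# Or, without --gpu_memory, measure the increase in RAM usage instead
python3 memory_benchmark.py --interval 0.5
```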
whisper_pipeline/faster-whisper-main/benchmark/normalizer.json ADDED
@@ -0,0 +1,1742 @@
1
+ {
2
+ "accessorise": "accessorize",
3
+ "accessorised": "accessorized",
4
+ "accessorises": "accessorizes",
5
+ "accessorising": "accessorizing",
6
+ "acclimatisation": "acclimatization",
7
+ "acclimatise": "acclimatize",
8
+ "acclimatised": "acclimatized",
9
+ "acclimatises": "acclimatizes",
10
+ "acclimatising": "acclimatizing",
11
+ "accoutrements": "accouterments",
12
+ "aeon": "eon",
13
+ "aeons": "eons",
14
+ "aerogramme": "aerogram",
15
+ "aerogrammes": "aerograms",
16
+ "aeroplane": "airplane",
17
+ "aeroplanes": "airplanes",
18
+ "aesthete": "esthete",
19
+ "aesthetes": "esthetes",
20
+ "aesthetic": "esthetic",
21
+ "aesthetically": "esthetically",
22
+ "aesthetics": "esthetics",
23
+ "aetiology": "etiology",
24
+ "ageing": "aging",
25
+ "aggrandisement": "aggrandizement",
26
+ "agonise": "agonize",
27
+ "agonised": "agonized",
28
+ "agonises": "agonizes",
29
+ "agonising": "agonizing",
30
+ "agonisingly": "agonizingly",
31
+ "almanack": "almanac",
32
+ "almanacks": "almanacs",
33
+ "aluminium": "aluminum",
34
+ "amortisable": "amortizable",
35
+ "amortisation": "amortization",
36
+ "amortisations": "amortizations",
37
+ "amortise": "amortize",
38
+ "amortised": "amortized",
39
+ "amortises": "amortizes",
40
+ "amortising": "amortizing",
41
+ "amphitheatre": "amphitheater",
42
+ "amphitheatres": "amphitheaters",
43
+ "anaemia": "anemia",
44
+ "anaemic": "anemic",
45
+ "anaesthesia": "anesthesia",
46
+ "anaesthetic": "anesthetic",
47
+ "anaesthetics": "anesthetics",
48
+ "anaesthetise": "anesthetize",
49
+ "anaesthetised": "anesthetized",
50
+ "anaesthetises": "anesthetizes",
51
+ "anaesthetising": "anesthetizing",
52
+ "anaesthetist": "anesthetist",
53
+ "anaesthetists": "anesthetists",
54
+ "anaesthetize": "anesthetize",
55
+ "anaesthetized": "anesthetized",
56
+ "anaesthetizes": "anesthetizes",
57
+ "anaesthetizing": "anesthetizing",
58
+ "analogue": "analog",
59
+ "analogues": "analogs",
60
+ "analyse": "analyze",
61
+ "analysed": "analyzed",
62
+ "analyses": "analyzes",
63
+ "analysing": "analyzing",
64
+ "anglicise": "anglicize",
65
+ "anglicised": "anglicized",
66
+ "anglicises": "anglicizes",
67
+ "anglicising": "anglicizing",
68
+ "annualised": "annualized",
69
+ "antagonise": "antagonize",
70
+ "antagonised": "antagonized",
71
+ "antagonises": "antagonizes",
72
+ "antagonising": "antagonizing",
73
+ "apologise": "apologize",
74
+ "apologised": "apologized",
75
+ "apologises": "apologizes",
76
+ "apologising": "apologizing",
77
+ "appal": "appall",
78
+ "appals": "appalls",
79
+ "appetiser": "appetizer",
80
+ "appetisers": "appetizers",
81
+ "appetising": "appetizing",
82
+ "appetisingly": "appetizingly",
83
+ "arbour": "arbor",
84
+ "arbours": "arbors",
85
+ "archaeologically": "archeologically",
86
+ "archaeologist": "archeologist",
87
+ "archaeologists": "archeologists",
88
+ "archaeology": "archeology</span>",
89
+ "archeological": "archaeological",
90
+ "ardour": "ardor",
91
+ "armour": "armor",
92
+ "armoured": "armored",
93
+ "armourer": "armorer",
94
+ "armourers": "armorers",
95
+ "armouries": "armories",
96
+ "armoury": "armory",
97
+ "artefact": "artifact",
98
+ "artefacts": "artifacts",
99
+ "authorise": "authorize",
100
+ "authorised": "authorized",
101
+ "authorises": "authorizes",
102
+ "authorising": "authorizing",
103
+ "axe": "ax",
104
+ "backpedalled": "backpedaled",
105
+ "backpedalling": "backpedaling",
106
+ "bannister": "banister",
107
+ "bannisters": "banisters",
108
+ "baptise": "baptize",
109
+ "baptised": "baptized",
110
+ "baptises": "baptizes",
111
+ "baptising": "baptizing",
112
+ "bastardise": "bastardize",
113
+ "bastardised": "bastardized",
114
+ "bastardises": "bastardizes",
115
+ "bastardising": "bastardizing",
116
+ "battleax": "battleaxe",
117
+ "baulk": "balk",
118
+ "baulked": "balked",
119
+ "baulking": "balking",
120
+ "baulks": "balks",
121
+ "bedevilled": "bedeviled",
122
+ "bedevilling": "bedeviling",
123
+ "behaviour": "behavior",
124
+ "behavioural": "behavioral",
125
+ "behaviourism": "behaviorism",
126
+ "behaviourist": "behaviorist",
127
+ "behaviourists": "behaviorists",
128
+ "behaviours": "behaviors",
129
+ "behove": "behoove",
130
+ "behoved": "behooved",
131
+ "behoves": "behooves",
132
+ "bejewelled": "bejeweled",
133
+ "belabour": "belabor",
134
+ "belaboured": "belabored",
135
+ "belabouring": "belaboring",
136
+ "belabours": "belabors",
137
+ "bevelled": "beveled",
138
+ "bevvies": "bevies",
139
+ "bevvy": "bevy",
140
+ "biassed": "biased",
141
+ "biassing": "biasing",
142
+ "bingeing": "binging",
143
+ "bougainvillaea": "bougainvillea",
144
+ "bougainvillaeas": "bougainvilleas",
145
+ "bowdlerise": "bowdlerize",
146
+ "bowdlerised": "bowdlerized",
147
+ "bowdlerises": "bowdlerizes",
148
+ "bowdlerising": "bowdlerizing",
149
+ "breathalyse": "breathalyze",
150
+ "breathalysed": "breathalyzed",
151
+ "breathalyser": "breathalyzer",
152
+ "breathalysers": "breathalyzers",
153
+ "breathalyses": "breathalyzes",
154
+ "breathalysing": "breathalyzing",
155
+ "brutalise": "brutalize",
156
+ "brutalised": "brutalized",
157
+ "brutalises": "brutalizes",
158
+ "brutalising": "brutalizing",
159
+ "busses": "buses",
160
+ "bussing": "busing",
161
+ "caesarean": "cesarean",
162
+ "caesareans": "cesareans",
163
+ "calibre": "caliber",
164
+ "calibres": "calibers",
165
+ "calliper": "caliper",
166
+ "callipers": "calipers",
167
+ "callisthenics": "calisthenics",
168
+ "canalise": "canalize",
169
+ "canalised": "canalized",
170
+ "canalises": "canalizes",
171
+ "canalising": "canalizing",
172
+ "cancelation": "cancellation",
173
+ "cancelations": "cancellations",
174
+ "cancelled": "canceled",
175
+ "cancelling": "canceling",
176
+ "candour": "candor",
177
+ "cannibalise": "cannibalize",
178
+ "cannibalised": "cannibalized",
179
+ "cannibalises": "cannibalizes",
180
+ "cannibalising": "cannibalizing",
181
+ "canonise": "canonize",
182
+ "canonised": "canonized",
183
+ "canonises": "canonizes",
184
+ "canonising": "canonizing",
185
+ "capitalise": "capitalize",
186
+ "capitalised": "capitalized",
187
+ "capitalises": "capitalizes",
188
+ "capitalising": "capitalizing",
189
+ "caramelise": "caramelize",
190
+ "caramelised": "caramelized",
191
+ "caramelises": "caramelizes",
192
+ "caramelising": "caramelizing",
193
+ "carbonise": "carbonize",
194
+ "carbonised": "carbonized",
195
+ "carbonises": "carbonizes",
196
+ "carbonising": "carbonizing",
197
+ "carolled": "caroled",
198
+ "carolling": "caroling",
199
+ "catalogue": "catalog",
200
+ "catalogued": "cataloged",
201
+ "catalogues": "catalogs",
202
+ "cataloguing": "cataloging",
203
+ "catalyse": "catalyze",
204
+ "catalysed": "catalyzed",
205
+ "catalyses": "catalyzes",
206
+ "catalysing": "catalyzing",
207
+ "categorise": "categorize",
208
+ "categorised": "categorized",
209
+ "categorises": "categorizes",
210
+ "categorising": "categorizing",
211
+ "cauterise": "cauterize",
212
+ "cauterised": "cauterized",
213
+ "cauterises": "cauterizes",
214
+ "cauterising": "cauterizing",
215
+ "cavilled": "caviled",
216
+ "cavilling": "caviling",
217
+ "centigramme": "centigram",
218
+ "centigrammes": "centigrams",
219
+ "centilitre": "centiliter",
220
+ "centilitres": "centiliters",
221
+ "centimetre": "centimeter",
222
+ "centimetres": "centimeters",
223
+ "centralise": "centralize",
224
+ "centralised": "centralized",
225
+ "centralises": "centralizes",
226
+ "centralising": "centralizing",
227
+ "centre": "center",
228
+ "centred": "centered",
229
+ "centrefold": "centerfold",
230
+ "centrefolds": "centerfolds",
231
+ "centrepiece": "centerpiece",
232
+ "centrepieces": "centerpieces",
233
+ "centres": "centers",
234
+ "channelled": "channeled",
235
+ "channelling": "channeling",
236
+ "characterise": "characterize",
237
+ "characterised": "characterized",
238
+ "characterises": "characterizes",
239
+ "characterising": "characterizing",
240
+ "cheque": "check",
241
+ "chequebook": "checkbook",
242
+ "chequebooks": "checkbooks",
243
+ "chequered": "checkered",
244
+ "cheques": "checks",
245
+ "chilli": "chili",
246
+ "chimaera": "chimera",
247
+ "chimaeras": "chimeras",
248
+ "chiselled": "chiseled",
249
+ "chiselling": "chiseling",
250
+ "circularise": "circularize",
251
+ "circularised": "circularized",
252
+ "circularises": "circularizes",
253
+ "circularising": "circularizing",
254
+ "civilise": "civilize",
255
+ "civilised": "civilized",
256
+ "civilises": "civilizes",
257
+ "civilising": "civilizing",
258
+ "clamour": "clamor",
259
+ "clamoured": "clamored",
260
+ "clamouring": "clamoring",
261
+ "clamours": "clamors",
262
+ "clangour": "clangor",
263
+ "clarinettist": "clarinetist",
264
+ "clarinettists": "clarinetists",
265
+ "collectivise": "collectivize",
266
+ "collectivised": "collectivized",
267
+ "collectivises": "collectivizes",
268
+ "collectivising": "collectivizing",
269
+ "colonisation": "colonization",
270
+ "colonise": "colonize",
271
+ "colonised": "colonized",
272
+ "coloniser": "colonizer",
273
+ "colonisers": "colonizers",
274
+ "colonises": "colonizes",
275
+ "colonising": "colonizing",
276
+ "colour": "color",
277
+ "colourant": "colorant",
278
+ "colourants": "colorants",
279
+ "coloured": "colored",
280
+ "coloureds": "coloreds",
281
+ "colourful": "colorful",
282
+ "colourfully": "colorfully",
283
+ "colouring": "coloring",
284
+ "colourize": "colorize",
285
+ "colourized": "colorized",
286
+ "colourizes": "colorizes",
287
+ "colourizing": "colorizing",
288
+ "colourless": "colorless",
289
+ "colours": "colors",
290
+ "commercialise": "commercialize",
291
+ "commercialised": "commercialized",
292
+ "commercialises": "commercializes",
293
+ "commercialising": "commercializing",
294
+ "compartmentalise": "compartmentalize",
295
+ "compartmentalised": "compartmentalized",
296
+ "compartmentalises": "compartmentalizes",
297
+ "compartmentalising": "compartmentalizing",
298
+ "computerise": "computerize",
299
+ "computerised": "computerized",
300
+ "computerises": "computerizes",
301
+ "computerising": "computerizing",
302
+ "conceptualise": "conceptualize",
303
+ "conceptualised": "conceptualized",
304
+ "conceptualises": "conceptualizes",
305
+ "conceptualising": "conceptualizing",
306
+ "connexion": "connection",
307
+ "connexions": "connections",
308
+ "contextualise": "contextualize",
309
+ "contextualised": "contextualized",
310
+ "contextualises": "contextualizes",
311
+ "contextualising": "contextualizing",
312
+ "cosier": "cozier",
313
+ "cosies": "cozies",
314
+ "cosiest": "coziest",
315
+ "cosily": "cozily",
316
+ "cosiness": "coziness",
317
+ "cosy": "cozy",
318
+ "councillor": "councilor",
319
+ "councillors": "councilors",
320
+ "counselled": "counseled",
321
+ "counselling": "counseling",
322
+ "counsellor": "counselor",
323
+ "counsellors": "counselors",
324
+ "crenelated": "crenellated",
325
+ "criminalise": "criminalize",
326
+ "criminalised": "criminalized",
327
+ "criminalises": "criminalizes",
328
+ "criminalising": "criminalizing",
329
+ "criticise": "criticize",
330
+ "criticised": "criticized",
331
+ "criticises": "criticizes",
332
+ "criticising": "criticizing",
333
+ "crueller": "crueler",
334
+ "cruellest": "cruelest",
335
+ "crystallisation": "crystallization",
336
+ "crystallise": "crystallize",
337
+ "crystallised": "crystallized",
338
+ "crystallises": "crystallizes",
339
+ "crystallising": "crystallizing",
340
+ "cudgelled": "cudgeled",
341
+ "cudgelling": "cudgeling",
342
+ "customise": "customize",
343
+ "customised": "customized",
344
+ "customises": "customizes",
345
+ "customising": "customizing",
346
+ "cypher": "cipher",
347
+ "cyphers": "ciphers",
348
+ "decentralisation": "decentralization",
349
+ "decentralise": "decentralize",
350
+ "decentralised": "decentralized",
351
+ "decentralises": "decentralizes",
352
+ "decentralising": "decentralizing",
353
+ "decriminalisation": "decriminalization",
354
+ "decriminalise": "decriminalize",
355
+ "decriminalised": "decriminalized",
356
+ "decriminalises": "decriminalizes",
357
+ "decriminalising": "decriminalizing",
358
+ "defence": "defense",
359
+ "defenceless": "defenseless",
360
+ "defences": "defenses",
361
+ "dehumanisation": "dehumanization",
362
+ "dehumanise": "dehumanize",
363
+ "dehumanised": "dehumanized",
364
+ "dehumanises": "dehumanizes",
365
+ "dehumanising": "dehumanizing",
366
+ "demeanour": "demeanor",
367
+ "demilitarisation": "demilitarization",
368
+ "demilitarise": "demilitarize",
369
+ "demilitarised": "demilitarized",
370
+ "demilitarises": "demilitarizes",
371
+ "demilitarising": "demilitarizing",
372
+ "demobilisation": "demobilization",
373
+ "demobilise": "demobilize",
374
+ "demobilised": "demobilized",
375
+ "demobilises": "demobilizes",
376
+ "demobilising": "demobilizing",
377
+ "democratisation": "democratization",
378
+ "democratise": "democratize",
379
+ "democratised": "democratized",
380
+ "democratises": "democratizes",
381
+ "democratising": "democratizing",
382
+ "demonise": "demonize",
383
+ "demonised": "demonized",
384
+ "demonises": "demonizes",
385
+ "demonising": "demonizing",
386
+ "demoralisation": "demoralization",
387
+ "demoralise": "demoralize",
388
+ "demoralised": "demoralized",
389
+ "demoralises": "demoralizes",
390
+ "demoralising": "demoralizing",
391
+ "denationalisation": "denationalization",
392
+ "denationalise": "denationalize",
393
+ "denationalised": "denationalized",
394
+ "denationalises": "denationalizes",
395
+ "denationalising": "denationalizing",
396
+ "deodorise": "deodorize",
397
+ "deodorised": "deodorized",
398
+ "deodorises": "deodorizes",
399
+ "deodorising": "deodorizing",
400
+ "depersonalise": "depersonalize",
401
+ "depersonalised": "depersonalized",
402
+ "depersonalises": "depersonalizes",
403
+ "depersonalising": "depersonalizing",
404
+ "deputise": "deputize",
405
+ "deputised": "deputized",
406
+ "deputises": "deputizes",
407
+ "deputising": "deputizing",
408
+ "desensitisation": "desensitization",
409
+ "desensitise": "desensitize",
410
+ "desensitised": "desensitized",
411
+ "desensitises": "desensitizes",
412
+ "desensitising": "desensitizing",
413
+ "destabilisation": "destabilization",
414
+ "destabilise": "destabilize",
415
+ "destabilised": "destabilized",
416
+ "destabilises": "destabilizes",
417
+ "destabilising": "destabilizing",
418
+ "dialled": "dialed",
419
+ "dialling": "dialing",
420
+ "dialogue": "dialog",
421
+ "dialogues": "dialogs",
422
+ "diarrhoea": "diarrhea",
423
+ "digitise": "digitize",
424
+ "digitised": "digitized",
425
+ "digitises": "digitizes",
426
+ "digitising": "digitizing",
427
+ "disc": "disk",
428
+ "discolour": "discolor",
429
+ "discoloured": "discolored",
430
+ "discolouring": "discoloring",
431
+ "discolours": "discolors",
432
+ "discs": "disks",
433
+ "disembowelled": "disemboweled",
434
+ "disembowelling": "disemboweling",
435
+ "disfavour": "disfavor",
436
+ "dishevelled": "disheveled",
437
+ "dishonour": "dishonor",
438
+ "dishonourable": "dishonorable",
439
+ "dishonourably": "dishonorably",
440
+ "dishonoured": "dishonored",
441
+ "dishonouring": "dishonoring",
442
+ "dishonours": "dishonors",
443
+ "disorganisation": "disorganization",
444
+ "disorganised": "disorganized",
445
+ "distil": "distill",
446
+ "distils": "distills",
447
+ "dramatisation": "dramatization",
448
+ "dramatisations": "dramatizations",
449
+ "dramatise": "dramatize",
450
+ "dramatised": "dramatized",
451
+ "dramatises": "dramatizes",
452
+ "dramatising": "dramatizing",
453
+ "draught": "draft",
454
+ "draughtboard": "draftboard",
455
+ "draughtboards": "draftboards",
456
+ "draughtier": "draftier",
457
+ "draughtiest": "draftiest",
458
+ "draughts": "drafts",
459
+ "draughtsman": "draftsman",
460
+ "draughtsmanship": "draftsmanship",
461
+ "draughtsmen": "draftsmen",
462
+ "draughtswoman": "draftswoman",
463
+ "draughtswomen": "draftswomen",
464
+ "draughty": "drafty",
465
+ "drivelled": "driveled",
466
+ "drivelling": "driveling",
467
+ "duelled": "dueled",
468
+ "duelling": "dueling",
469
+ "economise": "economize",
470
+ "economised": "economized",
471
+ "economises": "economizes",
472
+ "economising": "economizing",
473
+ "editorialise": "editorialize",
474
+ "editorialised": "editorialized",
475
+ "editorialises": "editorializes",
476
+ "editorialising": "editorializing",
477
+ "edoema": "edema",
478
+ "empathise": "empathize",
479
+ "empathised": "empathized",
480
+ "empathises": "empathizes",
481
+ "empathising": "empathizing",
482
+ "emphasise": "emphasize",
483
+ "emphasised": "emphasized",
484
+ "emphasises": "emphasizes",
485
+ "emphasising": "emphasizing",
486
+ "enamelled": "enameled",
487
+ "enamelling": "enameling",
488
+ "enamoured": "enamored",
489
+ "encyclopaedia": "encyclopedia",
490
+ "encyclopaedias": "encyclopedias",
491
+ "encyclopaedic": "encyclopedic",
492
+ "endeavour": "endeavor",
493
+ "endeavoured": "endeavored",
494
+ "endeavouring": "endeavoring",
495
+ "endeavours": "endeavors",
496
+ "energise": "energize",
497
+ "energised": "energized",
498
+ "energises": "energizes",
499
+ "energising": "energizing",
500
+ "enrol": "enroll",
501
+ "enrols": "enrolls",
502
+ "enthral": "enthrall",
503
+ "enthrals": "enthralls",
504
+ "epaulette": "epaulet",
505
+ "epaulettes": "epaulets",
506
+ "epicentre": "epicenter",
507
+ "epicentres": "epicenters",
508
+ "epilogue": "epilog",
509
+ "epilogues": "epilogs",
510
+ "epitomise": "epitomize",
511
+ "epitomised": "epitomized",
512
+ "epitomises": "epitomizes",
513
+ "epitomising": "epitomizing",
514
+ "equalisation": "equalization",
515
+ "equalise": "equalize",
516
+ "equalised": "equalized",
517
+ "equaliser": "equalizer",
518
+ "equalisers": "equalizers",
519
+ "equalises": "equalizes",
520
+ "equalising": "equalizing",
521
+ "eulogise": "eulogize",
522
+ "eulogised": "eulogized",
523
+ "eulogises": "eulogizes",
524
+ "eulogising": "eulogizing",
525
+ "evangelise": "evangelize",
526
+ "evangelised": "evangelized",
527
+ "evangelises": "evangelizes",
528
+ "evangelising": "evangelizing",
529
+ "exorcise": "exorcize",
530
+ "exorcised": "exorcized",
531
+ "exorcises": "exorcizes",
532
+ "exorcising": "exorcizing",
533
+ "extemporisation": "extemporization",
534
+ "extemporise": "extemporize",
535
+ "extemporised": "extemporized",
536
+ "extemporises": "extemporizes",
537
+ "extemporising": "extemporizing",
538
+ "externalisation": "externalization",
539
+ "externalisations": "externalizations",
540
+ "externalise": "externalize",
541
+ "externalised": "externalized",
542
+ "externalises": "externalizes",
543
+ "externalising": "externalizing",
544
+ "factorise": "factorize",
545
+ "factorised": "factorized",
546
+ "factorises": "factorizes",
547
+ "factorising": "factorizing",
548
+ "faecal": "fecal",
549
+ "faeces": "feces",
550
+ "familiarisation": "familiarization",
551
+ "familiarise": "familiarize",
552
+ "familiarised": "familiarized",
553
+ "familiarises": "familiarizes",
554
+ "familiarising": "familiarizing",
555
+ "fantasise": "fantasize",
556
+ "fantasised": "fantasized",
557
+ "fantasises": "fantasizes",
558
+ "fantasising": "fantasizing",
559
+ "favour": "favor",
560
+ "favourable": "favorable",
561
+ "favourably": "favorably",
562
+ "favoured": "favored",
563
+ "favouring": "favoring",
564
+ "favourite": "favorite",
565
+ "favourites": "favorites",
566
+ "favouritism": "favoritism",
567
+ "favours": "favors",
568
+ "feminise": "feminize",
569
+ "feminised": "feminized",
570
+ "feminises": "feminizes",
571
+ "feminising": "feminizing",
572
+ "fertilisation": "fertilization",
573
+ "fertilise": "fertilize",
574
+ "fertilised": "fertilized",
575
+ "fertiliser": "fertilizer",
576
+ "fertilisers": "fertilizers",
577
+ "fertilises": "fertilizes",
578
+ "fertilising": "fertilizing",
579
+ "fervour": "fervor",
580
+ "fibre": "fiber",
581
+ "fibreglass": "fiberglass",
582
+ "fibres": "fibers",
583
+ "fictionalisation": "fictionalization",
584
+ "fictionalisations": "fictionalizations",
585
+ "fictionalise": "fictionalize",
586
+ "fictionalised": "fictionalized",
587
+ "fictionalises": "fictionalizes",
588
+ "fictionalising": "fictionalizing",
589
+ "fillet": "filet",
590
+ "filleted": "fileted",
591
+ "filleting": "fileting",
592
+ "fillets": "filets",
593
+ "finalisation": "finalization",
594
+ "finalise": "finalize",
595
+ "finalised": "finalized",
596
+ "finalises": "finalizes",
597
+ "finalising": "finalizing",
598
+ "flautist": "flutist",
599
+ "flautists": "flutists",
600
+ "flavour": "flavor",
601
+ "flavoured": "flavored",
602
+ "flavouring": "flavoring",
603
+ "flavourings": "flavorings",
604
+ "flavourless": "flavorless",
605
+ "flavours": "flavors",
606
+ "flavoursome": "flavorsome",
607
+ "flyer / flier": "flier / flyer",
608
+ "foetal": "fetal",
609
+ "foetid": "fetid",
610
+ "foetus": "fetus",
611
+ "foetuses": "fetuses",
612
+ "formalisation": "formalization",
613
+ "formalise": "formalize",
614
+ "formalised": "formalized",
615
+ "formalises": "formalizes",
616
+ "formalising": "formalizing",
617
+ "fossilisation": "fossilization",
618
+ "fossilise": "fossilize",
619
+ "fossilised": "fossilized",
620
+ "fossilises": "fossilizes",
621
+ "fossilising": "fossilizing",
622
+ "fraternisation": "fraternization",
623
+ "fraternise": "fraternize",
624
+ "fraternised": "fraternized",
625
+ "fraternises": "fraternizes",
626
+ "fraternising": "fraternizing",
627
+ "fulfil": "fulfill",
628
+ "fulfilment": "fulfillment",
629
+ "fulfils": "fulfills",
630
+ "funnelled": "funneled",
631
+ "funnelling": "funneling",
632
+ "gage": "gauge",
633
+ "gaged": "gauged",
634
+ "gages": "gauges",
635
+ "gaging": "gauging",
636
+ "galvanise": "galvanize",
637
+ "galvanised": "galvanized",
638
+ "galvanises": "galvanizes",
639
+ "galvanising": "galvanizing",
640
+ "gambolled": "gamboled",
641
+ "gambolling": "gamboling",
642
+ "gaol": "jail",
643
+ "gaolbird": "jailbird",
644
+ "gaolbirds": "jailbirds",
645
+ "gaolbreak": "jailbreak",
646
+ "gaolbreaks": "jailbreaks",
647
+ "gaoled": "jailed",
648
+ "gaoler": "jailer",
649
+ "gaolers": "jailers",
650
+ "gaoling": "jailing",
651
+ "gaols": "jails",
652
+ "gasses": "gases",
653
+ "generalisation": "generalization",
654
+ "generalisations": "generalizations",
655
+ "generalise": "generalize",
656
+ "generalised": "generalized",
657
+ "generalises": "generalizes",
658
+ "generalising": "generalizing",
659
+ "ghettoise": "ghettoize",
660
+ "ghettoised": "ghettoized",
661
+ "ghettoises": "ghettoizes",
662
+ "ghettoising": "ghettoizing",
663
+ "gipsies": "gypsies",
664
+ "glamor": "glamour",
665
+ "glamorise": "glamorize",
666
+ "glamorised": "glamorized",
667
+ "glamorises": "glamorizes",
668
+ "glamorising": "glamorizing",
669
+ "globalisation": "globalization",
670
+ "globalise": "globalize",
671
+ "globalised": "globalized",
672
+ "globalises": "globalizes",
673
+ "globalising": "globalizing",
674
+ "glueing": "gluing",
675
+ "goitre": "goiter",
676
+ "goitres": "goiters",
677
+ "gonorrhoea": "gonorrhea",
678
+ "gramme": "gram",
679
+ "grammes": "grams",
680
+ "gravelled": "graveled",
681
+ "grey": "gray",
682
+ "greyed": "grayed",
683
+ "greying": "graying",
684
+ "greyish": "grayish",
685
+ "greyness": "grayness",
686
+ "greys": "grays",
687
+ "grovelled": "groveled",
688
+ "grovelling": "groveling",
689
+ "groyne": "groin",
690
+ "groynes": "groins",
691
+ "gruelling": "grueling",
692
+ "gruellingly": "gruelingly",
693
+ "gryphon": "griffin",
694
+ "gryphons": "griffins",
695
+ "gynaecological": "gynecological",
696
+ "gynaecologist": "gynecologist",
697
+ "gynaecologists": "gynecologists",
698
+ "gynaecology": "gynecology",
699
+ "haematological": "hematological",
700
+ "haematologist": "hematologist",
701
+ "haematologists": "hematologists",
702
+ "haematology": "hematology",
703
+ "haemoglobin": "hemoglobin",
704
+ "haemophilia": "hemophilia",
705
+ "haemophiliac": "hemophiliac",
706
+ "haemophiliacs": "hemophiliacs",
707
+ "haemorrhage": "hemorrhage",
708
+ "haemorrhaged": "hemorrhaged",
709
+ "haemorrhages": "hemorrhages",
710
+ "haemorrhaging": "hemorrhaging",
711
+ "haemorrhoids": "hemorrhoids",
712
+ "harbour": "harbor",
713
+ "harboured": "harbored",
714
+ "harbouring": "harboring",
715
+ "harbours": "harbors",
716
+ "harmonisation": "harmonization",
717
+ "harmonise": "harmonize",
718
+ "harmonised": "harmonized",
719
+ "harmonises": "harmonizes",
720
+ "harmonising": "harmonizing",
721
+ "homoeopath": "homeopath",
722
+ "homoeopathic": "homeopathic",
723
+ "homoeopaths": "homeopaths",
724
+ "homoeopathy": "homeopathy",
725
+ "homogenise": "homogenize",
726
+ "homogenised": "homogenized",
727
+ "homogenises": "homogenizes",
728
+ "homogenising": "homogenizing",
729
+ "honour": "honor",
730
+ "honourable": "honorable",
731
+ "honourably": "honorably",
732
+ "honoured": "honored",
733
+ "honouring": "honoring",
734
+ "honours": "honors",
735
+ "hospitalisation": "hospitalization",
736
+ "hospitalise": "hospitalize",
737
+ "hospitalised": "hospitalized",
738
+ "hospitalises": "hospitalizes",
739
+ "hospitalising": "hospitalizing",
740
+ "humanise": "humanize",
741
+ "humanised": "humanized",
742
+ "humanises": "humanizes",
743
+ "humanising": "humanizing",
744
+ "humour": "humor",
745
+ "humoured": "humored",
746
+ "humouring": "humoring",
747
+ "humourless": "humorless",
748
+ "humours": "humors",
749
+ "hybridise": "hybridize",
750
+ "hybridised": "hybridized",
751
+ "hybridises": "hybridizes",
752
+ "hybridising": "hybridizing",
753
+ "hypnotise": "hypnotize",
754
+ "hypnotised": "hypnotized",
755
+ "hypnotises": "hypnotizes",
756
+ "hypnotising": "hypnotizing",
757
+ "hypothesise": "hypothesize",
758
+ "hypothesised": "hypothesized",
759
+ "hypothesises": "hypothesizes",
760
+ "hypothesising": "hypothesizing",
761
+ "idealisation": "idealization",
762
+ "idealise": "idealize",
763
+ "idealised": "idealized",
764
+ "idealises": "idealizes",
765
+ "idealising": "idealizing",
766
+ "idolise": "idolize",
767
+ "idolised": "idolized",
768
+ "idolises": "idolizes",
769
+ "idolising": "idolizing",
770
+ "immobilisation": "immobilization",
771
+ "immobilise": "immobilize",
772
+ "immobilised": "immobilized",
773
+ "immobiliser": "immobilizer",
774
+ "immobilisers": "immobilizers",
775
+ "immobilises": "immobilizes",
776
+ "immobilising": "immobilizing",
777
+ "immortalise": "immortalize",
778
+ "immortalised": "immortalized",
779
+ "immortalises": "immortalizes",
780
+ "immortalising": "immortalizing",
781
+ "immunisation": "immunization",
782
+ "immunise": "immunize",
783
+ "immunised": "immunized",
784
+ "immunises": "immunizes",
785
+ "immunising": "immunizing",
786
+ "impanelled": "impaneled",
787
+ "impanelling": "impaneling",
788
+ "imperilled": "imperiled",
789
+ "imperilling": "imperiling",
790
+ "individualise": "individualize",
791
+ "individualised": "individualized",
792
+ "individualises": "individualizes",
793
+ "individualising": "individualizing",
794
+ "industrialise": "industrialize",
795
+ "industrialised": "industrialized",
796
+ "industrialises": "industrializes",
797
+ "industrialising": "industrializing",
798
+ "inflexion": "inflection",
799
+ "inflexions": "inflections",
800
+ "initialise": "initialize",
801
+ "initialised": "initialized",
802
+ "initialises": "initializes",
803
+ "initialising": "initializing",
804
+ "initialled": "initialed",
805
+ "initialling": "initialing",
806
+ "instal": "install",
807
+ "instalment": "installment",
808
+ "instalments": "installments",
809
+ "instals": "installs",
810
+ "instil": "instill",
811
+ "instils": "instills",
812
+ "institutionalisation": "institutionalization",
813
+ "institutionalise": "institutionalize",
814
+ "institutionalised": "institutionalized",
815
+ "institutionalises": "institutionalizes",
816
+ "institutionalising": "institutionalizing",
817
+ "intellectualise": "intellectualize",
818
+ "intellectualised": "intellectualized",
819
+ "intellectualises": "intellectualizes",
820
+ "intellectualising": "intellectualizing",
821
+ "internalisation": "internalization",
822
+ "internalise": "internalize",
823
+ "internalised": "internalized",
824
+ "internalises": "internalizes",
825
+ "internalising": "internalizing",
826
+ "internationalisation": "internationalization",
827
+ "internationalise": "internationalize",
828
+ "internationalised": "internationalized",
829
+ "internationalises": "internationalizes",
830
+ "internationalising": "internationalizing",
831
+ "ionisation": "ionization",
832
+ "ionise": "ionize",
833
+ "ionised": "ionized",
834
+ "ioniser": "ionizer",
835
+ "ionisers": "ionizers",
836
+ "ionises": "ionizes",
837
+ "ionising": "ionizing",
838
+ "italicise": "italicize",
839
+ "italicised": "italicized",
840
+ "italicises": "italicizes",
841
+ "italicising": "italicizing",
842
+ "itemise": "itemize",
843
+ "itemised": "itemized",
844
+ "itemises": "itemizes",
845
+ "itemising": "itemizing",
846
+ "jeopardise": "jeopardize",
847
+ "jeopardised": "jeopardized",
848
+ "jeopardises": "jeopardizes",
849
+ "jeopardising": "jeopardizing",
850
+ "jewelled": "jeweled",
851
+ "jeweller": "jeweler",
852
+ "jewellers": "jewelers",
853
+ "jewellery": "jewelry",
854
+ "judgement": "judgment",
855
+ "kilogramme": "kilogram",
856
+ "kilogrammes": "kilograms",
857
+ "kilometre": "kilometer",
858
+ "kilometres": "kilometers",
859
+ "labelled": "labeled",
860
+ "labelling": "labeling",
861
+ "labour": "labor",
862
+ "laboured": "labored",
863
+ "labourer": "laborer",
864
+ "labourers": "laborers",
865
+ "labouring": "laboring",
866
+ "labours": "labors",
867
+ "lacklustre": "lackluster",
868
+ "legalisation": "legalization",
869
+ "legalise": "legalize",
870
+ "legalised": "legalized",
871
+ "legalises": "legalizes",
872
+ "legalising": "legalizing",
873
+ "legitimise": "legitimize",
874
+ "legitimised": "legitimized",
875
+ "legitimises": "legitimizes",
876
+ "legitimising": "legitimizing",
877
+ "leukaemia": "leukemia",
878
+ "levelled": "leveled",
879
+ "leveller": "leveler",
880
+ "levellers": "levelers",
881
+ "levelling": "leveling",
882
+ "libelled": "libeled",
883
+ "libelling": "libeling",
884
+ "libellous": "libelous",
885
+ "liberalisation": "liberalization",
886
+ "liberalise": "liberalize",
887
+ "liberalised": "liberalized",
888
+ "liberalises": "liberalizes",
889
+ "liberalising": "liberalizing",
890
+ "licence": "license",
891
+ "licenced": "licensed",
892
+ "licences": "licenses",
893
+ "licencing": "licensing",
894
+ "likeable": "likable",
895
+ "lionisation": "lionization",
896
+ "lionise": "lionize",
897
+ "lionised": "lionized",
898
+ "lionises": "lionizes",
899
+ "lionising": "lionizing",
900
+ "liquidise": "liquidize",
901
+ "liquidised": "liquidized",
902
+ "liquidiser": "liquidizer",
903
+ "liquidisers": "liquidizers",
904
+ "liquidises": "liquidizes",
905
+ "liquidising": "liquidizing",
906
+ "litre": "liter",
907
+ "litres": "liters",
908
+ "localise": "localize",
909
+ "localised": "localized",
910
+ "localises": "localizes",
911
+ "localising": "localizing",
912
+ "louvre": "louver",
913
+ "louvred": "louvered",
914
+ "louvres": "louvers",
915
+ "lustre": "luster",
916
+ "magnetise": "magnetize",
917
+ "magnetised": "magnetized",
918
+ "magnetises": "magnetizes",
919
+ "magnetising": "magnetizing",
920
+ "manoeuvrability": "maneuverability",
921
+ "manoeuvrable": "maneuverable",
922
+ "manoeuvre": "maneuver",
923
+ "manoeuvred": "maneuvered",
924
+ "manoeuvres": "maneuvers",
925
+ "manoeuvring": "maneuvering",
926
+ "manoeuvrings": "maneuverings",
927
+ "marginalisation": "marginalization",
928
+ "marginalise": "marginalize",
929
+ "marginalised": "marginalized",
930
+ "marginalises": "marginalizes",
931
+ "marginalising": "marginalizing",
932
+ "marshalled": "marshaled",
933
+ "marshalling": "marshaling",
934
+ "marvelled": "marveled",
935
+ "marvelling": "marveling",
936
+ "marvellous": "marvelous",
937
+ "marvellously": "marvelously",
938
+ "materialisation": "materialization",
939
+ "materialise": "materialize",
940
+ "materialised": "materialized",
941
+ "materialises": "materializes",
942
+ "materialising": "materializing",
943
+ "maximisation": "maximization",
944
+ "maximise": "maximize",
945
+ "maximised": "maximized",
946
+ "maximises": "maximizes",
947
+ "maximising": "maximizing",
948
+ "meagre": "meager",
949
+ "mechanisation": "mechanization",
950
+ "mechanise": "mechanize",
951
+ "mechanised": "mechanized",
952
+ "mechanises": "mechanizes",
953
+ "mechanising": "mechanizing",
954
+ "mediaeval": "medieval",
955
+ "memorialise": "memorialize",
956
+ "memorialised": "memorialized",
957
+ "memorialises": "memorializes",
958
+ "memorialising": "memorializing",
959
+ "memorise": "memorize",
960
+ "memorised": "memorized",
961
+ "memorises": "memorizes",
962
+ "memorising": "memorizing",
963
+ "mesmerise": "mesmerize",
964
+ "mesmerised": "mesmerized",
965
+ "mesmerises": "mesmerizes",
966
+ "mesmerising": "mesmerizing",
967
+ "metabolise": "metabolize",
968
+ "metabolised": "metabolized",
969
+ "metabolises": "metabolizes",
970
+ "metabolising": "metabolizing",
971
+ "metre": "meter",
972
+ "metres": "meters",
973
+ "mhm": "hmm",
974
+ "micrometre": "micrometer",
975
+ "micrometres": "micrometers",
976
+ "militarise": "militarize",
977
+ "militarised": "militarized",
978
+ "militarises": "militarizes",
979
+ "militarising": "militarizing",
980
+ "milligramme": "milligram",
981
+ "milligrammes": "milligrams",
982
+ "millilitre": "milliliter",
983
+ "millilitres": "milliliters",
984
+ "millimetre": "millimeter",
985
+ "millimetres": "millimeters",
986
+ "miniaturisation": "miniaturization",
987
+ "miniaturise": "miniaturize",
988
+ "miniaturised": "miniaturized",
989
+ "miniaturises": "miniaturizes",
990
+ "miniaturising": "miniaturizing",
991
+ "minibusses": "minibuses",
992
+ "minimise": "minimize",
993
+ "minimised": "minimized",
994
+ "minimises": "minimizes",
995
+ "minimising": "minimizing",
996
+ "misbehaviour": "misbehavior",
997
+ "misdemeanour": "misdemeanor",
998
+ "misdemeanours": "misdemeanors",
999
+ "misspelt": "misspelled",
1000
+ "mitre": "miter",
1001
+ "mitres": "miters",
1002
+ "mm": "hmm",
1003
+ "mmm": "hmm",
1004
+ "mobilisation": "mobilization",
1005
+ "mobilise": "mobilize",
1006
+ "mobilised": "mobilized",
1007
+ "mobilises": "mobilizes",
1008
+ "mobilising": "mobilizing",
1009
+ "modelled": "modeled",
1010
+ "modeller": "modeler",
1011
+ "modellers": "modelers",
1012
+ "modelling": "modeling",
1013
+ "modernise": "modernize",
1014
+ "modernised": "modernized",
1015
+ "modernises": "modernizes",
1016
+ "modernising": "modernizing",
1017
+ "moisturise": "moisturize",
1018
+ "moisturised": "moisturized",
1019
+ "moisturiser": "moisturizer",
1020
+ "moisturisers": "moisturizers",
1021
+ "moisturises": "moisturizes",
1022
+ "moisturising": "moisturizing",
1023
+ "monologue": "monolog",
1024
+ "monologues": "monologs",
1025
+ "monopolisation": "monopolization",
1026
+ "monopolise": "monopolize",
1027
+ "monopolised": "monopolized",
1028
+ "monopolises": "monopolizes",
1029
+ "monopolising": "monopolizing",
1030
+ "moralise": "moralize",
1031
+ "moralised": "moralized",
1032
+ "moralises": "moralizes",
1033
+ "moralising": "moralizing",
1034
+ "motorised": "motorized",
1035
+ "mould": "mold",
1036
+ "moulded": "molded",
1037
+ "moulder": "molder",
1038
+ "mouldered": "moldered",
1039
+ "mouldering": "moldering",
1040
+ "moulders": "molders",
1041
+ "mouldier": "moldier",
1042
+ "mouldiest": "moldiest",
1043
+ "moulding": "molding",
1044
+ "mouldings": "moldings",
1045
+ "moulds": "molds",
1046
+ "mouldy": "moldy",
1047
+ "moult": "molt",
1048
+ "moulted": "molted",
1049
+ "moulting": "molting",
1050
+ "moults": "molts",
1051
+ "moustache": "mustache",
1052
+ "moustached": "mustached",
1053
+ "moustaches": "mustaches",
1054
+ "moustachioed": "mustachioed",
1055
+ "multicoloured": "multicolored",
1056
+ "nationalisation": "nationalization",
1057
+ "nationalisations": "nationalizations",
1058
+ "nationalise": "nationalize",
1059
+ "nationalised": "nationalized",
1060
+ "nationalises": "nationalizes",
1061
+ "nationalising": "nationalizing",
1062
+ "naturalisation": "naturalization",
1063
+ "naturalise": "naturalize",
1064
+ "naturalised": "naturalized",
1065
+ "naturalises": "naturalizes",
1066
+ "naturalising": "naturalizing",
1067
+ "neighbour": "neighbor",
1068
+ "neighbourhood": "neighborhood",
1069
+ "neighbourhoods": "neighborhoods",
1070
+ "neighbouring": "neighboring",
1071
+ "neighbourliness": "neighborliness",
1072
+ "neighbourly": "neighborly",
1073
+ "neighbours": "neighbors",
1074
+ "neutralisation": "neutralization",
1075
+ "neutralise": "neutralize",
1076
+ "neutralised": "neutralized",
1077
+ "neutralises": "neutralizes",
1078
+ "neutralising": "neutralizing",
1079
+ "normalisation": "normalization",
1080
+ "normalise": "normalize",
1081
+ "normalised": "normalized",
1082
+ "normalises": "normalizes",
1083
+ "normalising": "normalizing",
1084
+ "odour": "odor",
1085
+ "odourless": "odorless",
1086
+ "odours": "odors",
1087
+ "oesophagus": "esophagus",
1088
+ "oesophaguses": "esophaguses",
1089
+ "oestrogen": "estrogen",
1090
+ "offence": "offense",
1091
+ "offences": "offenses",
1092
+ "omelette": "omelet",
1093
+ "omelettes": "omelets",
1094
+ "optimise": "optimize",
1095
+ "optimised": "optimized",
1096
+ "optimises": "optimizes",
1097
+ "optimising": "optimizing",
1098
+ "organisation": "organization",
1099
+ "organisational": "organizational",
1100
+ "organisations": "organizations",
1101
+ "organise": "organize",
1102
+ "organised": "organized",
1103
+ "organiser": "organizer",
1104
+ "organisers": "organizers",
1105
+ "organises": "organizes",
1106
+ "organising": "organizing",
1107
+ "orthopaedic": "orthopedic",
1108
+ "orthopaedics": "orthopedics",
1109
+ "ostracise": "ostracize",
1110
+ "ostracised": "ostracized",
1111
+ "ostracises": "ostracizes",
1112
+ "ostracising": "ostracizing",
1113
+ "outmanoeuvre": "outmaneuver",
1114
+ "outmanoeuvred": "outmaneuvered",
1115
+ "outmanoeuvres": "outmaneuvers",
1116
+ "outmanoeuvring": "outmaneuvering",
1117
+ "overemphasise": "overemphasize",
1118
+ "overemphasised": "overemphasized",
1119
+ "overemphasises": "overemphasizes",
1120
+ "overemphasising": "overemphasizing",
1121
+ "oxidisation": "oxidization",
1122
+ "oxidise": "oxidize",
1123
+ "oxidised": "oxidized",
1124
+ "oxidises": "oxidizes",
1125
+ "oxidising": "oxidizing",
1126
+ "paederast": "pederast",
1127
+ "paederasts": "pederasts",
1128
+ "paediatric": "pediatric",
1129
+ "paediatrician": "pediatrician",
1130
+ "paediatricians": "pediatricians",
1131
+ "paediatrics": "pediatrics",
1132
+ "paedophile": "pedophile",
1133
+ "paedophiles": "pedophiles",
1134
+ "paedophilia": "pedophilia",
1135
+ "palaeolithic": "paleolithic",
1136
+ "palaeontologist": "paleontologist",
1137
+ "palaeontologists": "paleontologists",
1138
+ "palaeontology": "paleontology",
1139
+ "panelled": "paneled",
1140
+ "panelling": "paneling",
1141
+ "panellist": "panelist",
1142
+ "panellists": "panelists",
1143
+ "paralyse": "paralyze",
1144
+ "paralysed": "paralyzed",
1145
+ "paralyses": "paralyzes",
1146
+ "paralysing": "paralyzing",
1147
+ "parcelled": "parceled",
1148
+ "parcelling": "parceling",
1149
+ "parlour": "parlor",
1150
+ "parlours": "parlors",
1151
+ "particularise": "particularize",
1152
+ "particularised": "particularized",
1153
+ "particularises": "particularizes",
1154
+ "particularising": "particularizing",
1155
+ "passivisation": "passivization",
1156
+ "passivise": "passivize",
1157
+ "passivised": "passivized",
1158
+ "passivises": "passivizes",
1159
+ "passivising": "passivizing",
1160
+ "pasteurisation": "pasteurization",
1161
+ "pasteurise": "pasteurize",
1162
+ "pasteurised": "pasteurized",
1163
+ "pasteurises": "pasteurizes",
1164
+ "pasteurising": "pasteurizing",
1165
+ "patronise": "patronize",
1166
+ "patronised": "patronized",
1167
+ "patronises": "patronizes",
1168
+ "patronising": "patronizing",
1169
+ "patronisingly": "patronizingly",
1170
+ "pedalled": "pedaled",
1171
+ "pedalling": "pedaling",
1172
+ "pedestrianisation": "pedestrianization",
1173
+ "pedestrianise": "pedestrianize",
1174
+ "pedestrianised": "pedestrianized",
1175
+ "pedestrianises": "pedestrianizes",
1176
+ "pedestrianising": "pedestrianizing",
1177
+ "penalise": "penalize",
1178
+ "penalised": "penalized",
1179
+ "penalises": "penalizes",
1180
+ "penalising": "penalizing",
1181
+ "pencilled": "penciled",
1182
+ "pencilling": "penciling",
1183
+ "personalise": "personalize",
1184
+ "personalised": "personalized",
1185
+ "personalises": "personalizes",
1186
+ "personalising": "personalizing",
1187
+ "pharmacopoeia": "pharmacopeia",
1188
+ "pharmacopoeias": "pharmacopeias",
1189
+ "philosophise": "philosophize",
1190
+ "philosophised": "philosophized",
1191
+ "philosophises": "philosophizes",
1192
+ "philosophising": "philosophizing",
1193
+ "philtre": "filter",
1194
+ "philtres": "filters",
1195
+ "phoney": "phony",
1196
+ "plagiarise": "plagiarize",
1197
+ "plagiarised": "plagiarized",
1198
+ "plagiarises": "plagiarizes",
1199
+ "plagiarising": "plagiarizing",
1200
+ "plough": "plow",
1201
+ "ploughed": "plowed",
1202
+ "ploughing": "plowing",
1203
+ "ploughman": "plowman",
1204
+ "ploughmen": "plowmen",
1205
+ "ploughs": "plows",
1206
+ "ploughshare": "plowshare",
1207
+ "ploughshares": "plowshares",
1208
+ "polarisation": "polarization",
1209
+ "polarise": "polarize",
1210
+ "polarised": "polarized",
1211
+ "polarises": "polarizes",
1212
+ "polarising": "polarizing",
1213
+ "politicisation": "politicization",
1214
+ "politicise": "politicize",
1215
+ "politicised": "politicized",
1216
+ "politicises": "politicizes",
1217
+ "politicising": "politicizing",
1218
+ "popularisation": "popularization",
1219
+ "popularise": "popularize",
1220
+ "popularised": "popularized",
1221
+ "popularises": "popularizes",
1222
+ "popularising": "popularizing",
1223
+ "pouffe": "pouf",
1224
+ "pouffes": "poufs",
1225
+ "practise": "practice",
1226
+ "practised": "practiced",
1227
+ "practises": "practices",
1228
+ "practising": "practicing",
1229
+ "praesidium": "presidium",
1230
+ "praesidiums": "presidiums",
1231
+ "pressurisation": "pressurization",
1232
+ "pressurise": "pressurize",
1233
+ "pressurised": "pressurized",
1234
+ "pressurises": "pressurizes",
1235
+ "pressurising": "pressurizing",
1236
+ "pretence": "pretense",
1237
+ "pretences": "pretenses",
1238
+ "primaeval": "primeval",
1239
+ "prioritisation": "prioritization",
1240
+ "prioritise": "prioritize",
1241
+ "prioritised": "prioritized",
1242
+ "prioritises": "prioritizes",
1243
+ "prioritising": "prioritizing",
1244
+ "privatisation": "privatization",
1245
+ "privatisations": "privatizations",
1246
+ "privatise": "privatize",
1247
+ "privatised": "privatized",
1248
+ "privatises": "privatizes",
1249
+ "privatising": "privatizing",
1250
+ "professionalisation": "professionalization",
1251
+ "professionalise": "professionalize",
1252
+ "professionalised": "professionalized",
1253
+ "professionalises": "professionalizes",
1254
+ "professionalising": "professionalizing",
1255
+ "programme": "program",
1256
+ "programmes": "programs",
1257
+ "prologue": "prolog",
1258
+ "prologues": "prologs",
1259
+ "propagandise": "propagandize",
1260
+ "propagandised": "propagandized",
1261
+ "propagandises": "propagandizes",
1262
+ "propagandising": "propagandizing",
1263
+ "proselytise": "proselytize",
1264
+ "proselytised": "proselytized",
1265
+ "proselytiser": "proselytizer",
1266
+ "proselytisers": "proselytizers",
1267
+ "proselytises": "proselytizes",
1268
+ "proselytising": "proselytizing",
1269
+ "psychoanalyse": "psychoanalyze",
1270
+ "psychoanalysed": "psychoanalyzed",
1271
+ "psychoanalyses": "psychoanalyzes",
1272
+ "psychoanalysing": "psychoanalyzing",
1273
+ "publicise": "publicize",
1274
+ "publicised": "publicized",
1275
+ "publicises": "publicizes",
1276
+ "publicising": "publicizing",
1277
+ "pulverisation": "pulverization",
1278
+ "pulverise": "pulverize",
1279
+ "pulverised": "pulverized",
1280
+ "pulverises": "pulverizes",
1281
+ "pulverising": "pulverizing",
1282
+ "pummelled": "pummel",
1283
+ "pummelling": "pummeled",
1284
+ "pyjama": "pajama",
1285
+ "pyjamas": "pajamas",
1286
+ "pzazz": "pizzazz",
1287
+ "quarrelled": "quarreled",
1288
+ "quarrelling": "quarreling",
1289
+ "radicalise": "radicalize",
1290
+ "radicalised": "radicalized",
1291
+ "radicalises": "radicalizes",
1292
+ "radicalising": "radicalizing",
1293
+ "rancour": "rancor",
1294
+ "randomise": "randomize",
1295
+ "randomised": "randomized",
1296
+ "randomises": "randomizes",
1297
+ "randomising": "randomizing",
1298
+ "rationalisation": "rationalization",
1299
+ "rationalisations": "rationalizations",
1300
+ "rationalise": "rationalize",
1301
+ "rationalised": "rationalized",
1302
+ "rationalises": "rationalizes",
1303
+ "rationalising": "rationalizing",
1304
+ "ravelled": "raveled",
1305
+ "ravelling": "raveling",
1306
+ "realisable": "realizable",
1307
+ "realisation": "realization",
1308
+ "realisations": "realizations",
1309
+ "realise": "realize",
1310
+ "realised": "realized",
1311
+ "realises": "realizes",
1312
+ "realising": "realizing",
1313
+ "recognisable": "recognizable",
1314
+ "recognisably": "recognizably",
1315
+ "recognisance": "recognizance",
1316
+ "recognise": "recognize",
1317
+ "recognised": "recognized",
1318
+ "recognises": "recognizes",
1319
+ "recognising": "recognizing",
1320
+ "reconnoitre": "reconnoiter",
1321
+ "reconnoitred": "reconnoitered",
1322
+ "reconnoitres": "reconnoiters",
1323
+ "reconnoitring": "reconnoitering",
1324
+ "refuelled": "refueled",
1325
+ "refuelling": "refueling",
1326
+ "regularisation": "regularization",
1327
+ "regularise": "regularize",
1328
+ "regularised": "regularized",
1329
+ "regularises": "regularizes",
1330
+ "regularising": "regularizing",
1331
+ "remodelled": "remodeled",
1332
+ "remodelling": "remodeling",
1333
+ "remould": "remold",
1334
+ "remoulded": "remolded",
1335
+ "remoulding": "remolding",
1336
+ "remoulds": "remolds",
1337
+ "reorganisation": "reorganization",
1338
+ "reorganisations": "reorganizations",
1339
+ "reorganise": "reorganize",
1340
+ "reorganised": "reorganized",
1341
+ "reorganises": "reorganizes",
1342
+ "reorganising": "reorganizing",
1343
+ "revelled": "reveled",
1344
+ "reveller": "reveler",
1345
+ "revellers": "revelers",
1346
+ "revelling": "reveling",
1347
+ "revitalise": "revitalize",
1348
+ "revitalised": "revitalized",
1349
+ "revitalises": "revitalizes",
1350
+ "revitalising": "revitalizing",
1351
+ "revolutionise": "revolutionize",
1352
+ "revolutionised": "revolutionized",
1353
+ "revolutionises": "revolutionizes",
1354
+ "revolutionising": "revolutionizing",
1355
+ "rhapsodise": "rhapsodize",
1356
+ "rhapsodised": "rhapsodized",
1357
+ "rhapsodises": "rhapsodizes",
1358
+ "rhapsodising": "rhapsodizing",
1359
+ "rigour": "rigor",
1360
+ "rigours": "rigors",
1361
+ "ritualised": "ritualized",
1362
+ "rivalled": "rivaled",
1363
+ "rivalling": "rivaling",
1364
+ "romanticise": "romanticize",
1365
+ "romanticised": "romanticized",
1366
+ "romanticises": "romanticizes",
1367
+ "romanticising": "romanticizing",
1368
+ "rumour": "rumor",
1369
+ "rumoured": "rumored",
1370
+ "rumours": "rumors",
1371
+ "sabre": "saber",
1372
+ "sabres": "sabers",
1373
+ "saltpetre": "saltpeter",
1374
+ "sanitise": "sanitize",
1375
+ "sanitised": "sanitized",
1376
+ "sanitises": "sanitizes",
1377
+ "sanitising": "sanitizing",
1378
+ "satirise": "satirize",
1379
+ "satirised": "satirized",
1380
+ "satirises": "satirizes",
1381
+ "satirising": "satirizing",
1382
+ "saviour": "savior",
1383
+ "saviours": "saviors",
1384
+ "savour": "savor",
1385
+ "savoured": "savored",
1386
+ "savouries": "savories",
1387
+ "savouring": "savoring",
1388
+ "savours": "savors",
1389
+ "savoury": "savory",
1390
+ "scandalise": "scandalize",
1391
+ "scandalised": "scandalized",
1392
+ "scandalises": "scandalizes",
1393
+ "scandalising": "scandalizing",
1394
+ "sceptic": "skeptic",
1395
+ "sceptical": "skeptical",
1396
+ "sceptically": "skeptically",
1397
+ "scepticism": "skepticism",
1398
+ "sceptics": "skeptics",
1399
+ "sceptre": "scepter",
1400
+ "sceptres": "scepters",
1401
+ "scrutinise": "scrutinize",
1402
+ "scrutinised": "scrutinized",
1403
+ "scrutinises": "scrutinizes",
1404
+ "scrutinising": "scrutinizing",
1405
+ "secularisation": "secularization",
1406
+ "secularise": "secularize",
1407
+ "secularised": "secularized",
1408
+ "secularises": "secularizes",
1409
+ "secularising": "secularizing",
1410
+ "sensationalise": "sensationalize",
1411
+ "sensationalised": "sensationalized",
1412
+ "sensationalises": "sensationalizes",
1413
+ "sensationalising": "sensationalizing",
1414
+ "sensitise": "sensitize",
1415
+ "sensitised": "sensitized",
1416
+ "sensitises": "sensitizes",
1417
+ "sensitising": "sensitizing",
1418
+ "sentimentalise": "sentimentalize",
1419
+ "sentimentalised": "sentimentalized",
1420
+ "sentimentalises": "sentimentalizes",
1421
+ "sentimentalising": "sentimentalizing",
1422
+ "sepulchre": "sepulcher",
1423
+ "sepulchres": "sepulchers",
1424
+ "serialisation": "serialization",
1425
+ "serialisations": "serializations",
1426
+ "serialise": "serialize",
1427
+ "serialised": "serialized",
1428
+ "serialises": "serializes",
1429
+ "serialising": "serializing",
1430
+ "sermonise": "sermonize",
1431
+ "sermonised": "sermonized",
1432
+ "sermonises": "sermonizes",
1433
+ "sermonising": "sermonizing",
1434
+ "sheikh": "sheik",
1435
+ "shovelled": "shoveled",
1436
+ "shovelling": "shoveling",
1437
+ "shrivelled": "shriveled",
1438
+ "shrivelling": "shriveling",
1439
+ "signalise": "signalize",
1440
+ "signalised": "signalized",
1441
+ "signalises": "signalizes",
1442
+ "signalising": "signalizing",
1443
+ "signalled": "signaled",
1444
+ "signalling": "signaling",
1445
+ "smoulder": "smolder",
1446
+ "smouldered": "smoldered",
1447
+ "smouldering": "smoldering",
1448
+ "smoulders": "smolders",
1449
+ "snivelled": "sniveled",
1450
+ "snivelling": "sniveling",
1451
+ "snorkelled": "snorkeled",
1452
+ "snorkelling": "snorkeling",
1453
+ "snowplough": "snowplow",
1454
+ "snowploughs": "snowplow",
1455
+ "socialisation": "socialization",
1456
+ "socialise": "socialize",
1457
+ "socialised": "socialized",
1458
+ "socialises": "socializes",
1459
+ "socialising": "socializing",
1460
+ "sodomise": "sodomize",
1461
+ "sodomised": "sodomized",
1462
+ "sodomises": "sodomizes",
1463
+ "sodomising": "sodomizing",
1464
+ "solemnise": "solemnize",
1465
+ "solemnised": "solemnized",
1466
+ "solemnises": "solemnizes",
1467
+ "solemnising": "solemnizing",
1468
+ "sombre": "somber",
1469
+ "specialisation": "specialization",
1470
+ "specialisations": "specializations",
1471
+ "specialise": "specialize",
1472
+ "specialised": "specialized",
1473
+ "specialises": "specializes",
1474
+ "specialising": "specializing",
1475
+ "spectre": "specter",
1476
+ "spectres": "specters",
1477
+ "spiralled": "spiraled",
1478
+ "spiralling": "spiraling",
1479
+ "splendour": "splendor",
1480
+ "splendours": "splendors",
1481
+ "squirrelled": "squirreled",
1482
+ "squirrelling": "squirreling",
1483
+ "stabilisation": "stabilization",
1484
+ "stabilise": "stabilize",
1485
+ "stabilised": "stabilized",
1486
+ "stabiliser": "stabilizer",
1487
+ "stabilisers": "stabilizers",
1488
+ "stabilises": "stabilizes",
1489
+ "stabilising": "stabilizing",
1490
+ "standardisation": "standardization",
1491
+ "standardise": "standardize",
1492
+ "standardised": "standardized",
1493
+ "standardises": "standardizes",
1494
+ "standardising": "standardizing",
1495
+ "stencilled": "stenciled",
1496
+ "stencilling": "stenciling",
1497
+ "sterilisation": "sterilization",
1498
+ "sterilisations": "sterilizations",
1499
+ "sterilise": "sterilize",
1500
+ "sterilised": "sterilized",
1501
+ "steriliser": "sterilizer",
1502
+ "sterilisers": "sterilizers",
1503
+ "sterilises": "sterilizes",
1504
+ "sterilising": "sterilizing",
1505
+ "stigmatisation": "stigmatization",
1506
+ "stigmatise": "stigmatize",
1507
+ "stigmatised": "stigmatized",
1508
+ "stigmatises": "stigmatizes",
1509
+ "stigmatising": "stigmatizing",
1510
+ "storey": "story",
1511
+ "storeys": "stories",
1512
+ "subsidisation": "subsidization",
1513
+ "subsidise": "subsidize",
1514
+ "subsidised": "subsidized",
1515
+ "subsidiser": "subsidizer",
1516
+ "subsidisers": "subsidizers",
1517
+ "subsidises": "subsidizes",
1518
+ "subsidising": "subsidizing",
1519
+ "succour": "succor",
1520
+ "succoured": "succored",
1521
+ "succouring": "succoring",
1522
+ "succours": "succors",
1523
+ "sulphate": "sulfate",
1524
+ "sulphates": "sulfates",
1525
+ "sulphide": "sulfide",
1526
+ "sulphides": "sulfides",
1527
+ "sulphur": "sulfur",
1528
+ "sulphurous": "sulfurous",
1529
+ "summarise": "summarize",
1530
+ "summarised": "summarized",
1531
+ "summarises": "summarizes",
1532
+ "summarising": "summarizing",
1533
+ "swivelled": "swiveled",
1534
+ "swivelling": "swiveling",
1535
+ "symbolise": "symbolize",
1536
+ "symbolised": "symbolized",
1537
+ "symbolises": "symbolizes",
1538
+ "symbolising": "symbolizing",
1539
+ "sympathise": "sympathize",
1540
+ "sympathised": "sympathized",
1541
+ "sympathiser": "sympathizer",
1542
+ "sympathisers": "sympathizers",
1543
+ "sympathises": "sympathizes",
1544
+ "sympathising": "sympathizing",
1545
+ "synchronisation": "synchronization",
1546
+ "synchronise": "synchronize",
1547
+ "synchronised": "synchronized",
1548
+ "synchronises": "synchronizes",
1549
+ "synchronising": "synchronizing",
1550
+ "synthesise": "synthesize",
1551
+ "synthesised": "synthesized",
1552
+ "synthesiser": "synthesizer",
1553
+ "synthesisers": "synthesizers",
1554
+ "synthesises": "synthesizes",
1555
+ "synthesising": "synthesizing",
1556
+ "syphon": "siphon",
1557
+ "syphoned": "siphoned",
1558
+ "syphoning": "siphoning",
1559
+ "syphons": "siphons",
1560
+ "systematisation": "systematization",
1561
+ "systematise": "systematize",
1562
+ "systematised": "systematized",
1563
+ "systematises": "systematizes",
1564
+ "systematising": "systematizing",
1565
+ "tantalise": "tantalize",
1566
+ "tantalised": "tantalized",
1567
+ "tantalises": "tantalizes",
1568
+ "tantalising": "tantalizing",
1569
+ "tantalisingly": "tantalizingly",
1570
+ "tasselled": "tasseled",
1571
+ "technicolour": "technicolor",
1572
+ "temporise": "temporize",
1573
+ "temporised": "temporized",
1574
+ "temporises": "temporizes",
1575
+ "temporising": "temporizing",
1576
+ "tenderise": "tenderize",
1577
+ "tenderised": "tenderized",
1578
+ "tenderises": "tenderizes",
1579
+ "tenderising": "tenderizing",
1580
+ "terrorise": "terrorize",
1581
+ "terrorised": "terrorized",
1582
+ "terrorises": "terrorizes",
1583
+ "terrorising": "terrorizing",
1584
+ "theatre": "theater",
1585
+ "theatregoer": "theatergoer",
1586
+ "theatregoers": "theatergoers",
1587
+ "theatres": "theaters",
1588
+ "theorise": "theorize",
1589
+ "theorised": "theorized",
1590
+ "theorises": "theorizes",
1591
+ "theorising": "theorizing",
1592
+ "tonne": "ton",
1593
+ "tonnes": "tons",
1594
+ "towelled": "toweled",
1595
+ "towelling": "toweling",
1596
+ "toxaemia": "toxemia",
1597
+ "tranquillise": "tranquilize",
1598
+ "tranquillised": "tranquilized",
1599
+ "tranquilliser": "tranquilizer",
1600
+ "tranquillisers": "tranquilizers",
1601
+ "tranquillises": "tranquilizes",
1602
+ "tranquillising": "tranquilizing",
1603
+ "tranquillity": "tranquility",
1604
+ "tranquillize": "tranquilize",
1605
+ "tranquillized": "tranquilized",
1606
+ "tranquillizer": "tranquilizer",
1607
+ "tranquillizers": "tranquilizers",
1608
+ "tranquillizes": "tranquilizes",
1609
+ "tranquillizing": "tranquilizing",
1610
+ "tranquilly": "tranquility",
1611
+ "transistorised": "transistorized",
1612
+ "traumatise": "traumatize",
1613
+ "traumatised": "traumatized",
1614
+ "traumatises": "traumatizes",
1615
+ "traumatising": "traumatizing",
1616
+ "travelled": "traveled",
1617
+ "traveller": "traveler",
1618
+ "travellers": "travelers",
1619
+ "travelling": "traveling",
1620
+ "travelog": "travelogue",
1621
+ "travelogs": "travelogues",
1622
+ "trialled": "trialed",
1623
+ "trialling": "trialing",
1624
+ "tricolour": "tricolor",
1625
+ "tricolours": "tricolors",
1626
+ "trivialise": "trivialize",
1627
+ "trivialised": "trivialized",
1628
+ "trivialises": "trivializes",
1629
+ "trivialising": "trivializing",
1630
+ "tumour": "tumor",
1631
+ "tumours": "tumors",
1632
+ "tunnelled": "tunneled",
1633
+ "tunnelling": "tunneling",
1634
+ "tyrannise": "tyrannize",
1635
+ "tyrannised": "tyrannized",
1636
+ "tyrannises": "tyrannizes",
1637
+ "tyrannising": "tyrannizing",
1638
+ "tyre": "tire",
1639
+ "tyres": "tires",
1640
+ "unauthorised": "unauthorized",
1641
+ "uncivilised": "uncivilized",
1642
+ "underutilised": "underutilized",
1643
+ "unequalled": "unequaled",
1644
+ "unfavourable": "unfavorable",
1645
+ "unfavourably": "unfavorably",
1646
+ "unionisation": "unionization",
1647
+ "unionise": "unionize",
1648
+ "unionised": "unionized",
1649
+ "unionises": "unionizes",
1650
+ "unionising": "unionizing",
1651
+ "unorganised": "unorganized",
1652
+ "unravelled": "unraveled",
1653
+ "unravelling": "unraveling",
1654
+ "unrecognisable": "unrecognizable",
1655
+ "unrecognised": "unrecognized",
1656
+ "unrivalled": "unrivaled",
1657
+ "unsavoury": "unsavory",
1658
+ "untrammelled": "untrammeled",
1659
+ "urbanisation": "urbanization",
1660
+ "urbanise": "urbanize",
1661
+ "urbanised": "urbanized",
1662
+ "urbanises": "urbanizes",
1663
+ "urbanising": "urbanizing",
1664
+ "utilisable": "utilizable",
1665
+ "utilisation": "utilization",
1666
+ "utilise": "utilize",
1667
+ "utilised": "utilized",
1668
+ "utilises": "utilizes",
1669
+ "utilising": "utilizing",
1670
+ "valour": "valor",
1671
+ "vandalise": "vandalize",
1672
+ "vandalised": "vandalized",
1673
+ "vandalises": "vandalizes",
1674
+ "vandalising": "vandalizing",
1675
+ "vaporisation": "vaporization",
1676
+ "vaporise": "vaporize",
1677
+ "vaporised": "vaporized",
1678
+ "vaporises": "vaporizes",
1679
+ "vaporising": "vaporizing",
1680
+ "vapour": "vapor",
1681
+ "vapours": "vapors",
1682
+ "verbalise": "verbalize",
1683
+ "verbalised": "verbalized",
1684
+ "verbalises": "verbalizes",
1685
+ "verbalising": "verbalizing",
1686
+ "victimisation": "victimization",
1687
+ "victimise": "victimize",
1688
+ "victimised": "victimized",
1689
+ "victimises": "victimizes",
1690
+ "victimising": "victimizing",
1691
+ "videodisc": "videodisk",
1692
+ "videodiscs": "videodisks",
1693
+ "vigour": "vigor",
1694
+ "visualisation": "visualization",
1695
+ "visualisations": "visualizations",
1696
+ "visualise": "visualize",
1697
+ "visualised": "visualized",
1698
+ "visualises": "visualizes",
1699
+ "visualising": "visualizing",
1700
+ "vocalisation": "vocalization",
1701
+ "vocalisations": "vocalizations",
1702
+ "vocalise": "vocalize",
1703
+ "vocalised": "vocalized",
1704
+ "vocalises": "vocalizes",
1705
+ "vocalising": "vocalizing",
1706
+ "vulcanised": "vulcanized",
1707
+ "vulgarisation": "vulgarization",
1708
+ "vulgarise": "vulgarize",
1709
+ "vulgarised": "vulgarized",
1710
+ "vulgarises": "vulgarizes",
1711
+ "vulgarising": "vulgarizing",
1712
+ "waggon": "wagon",
1713
+ "waggons": "wagons",
1714
+ "watercolour": "watercolor",
1715
+ "watercolours": "watercolors",
1716
+ "weaselled": "weaseled",
1717
+ "weaselling": "weaseling",
1718
+ "westernisation": "westernization",
1719
+ "westernise": "westernize",
1720
+ "westernised": "westernized",
1721
+ "westernises": "westernizes",
1722
+ "westernising": "westernizing",
1723
+ "womanise": "womanize",
1724
+ "womanised": "womanized",
1725
+ "womaniser": "womanizer",
1726
+ "womanisers": "womanizers",
1727
+ "womanises": "womanizes",
1728
+ "womanising": "womanizing",
1729
+ "woollen": "woolen",
1730
+ "woollens": "woolens",
1731
+ "woollies": "woolies",
1732
+ "woolly": "wooly",
1733
+ "worshipped": "worshiped",
1734
+ "worshipper": "worshiper",
1735
+ "worshipping": "worshiping",
1736
+ "yodelled": "yodeled",
1737
+ "yodelling": "yodeling",
1738
+ "yoghourt": "yogurt",
1739
+ "yoghourts": "yogurts",
1740
+ "yoghurt": "yogurt",
1741
+ "yoghurts": "yogurts"
1742
+ }
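
The table above ends the British-to-American spelling map that the WER benchmark feeds into transformers' EnglishTextNormalizer (see wer_benchmark.py below). As a rough sketch only of how such a word map is applied — plain per-word substitution, with the file path and sample sentence assumed for illustration:

import json

# Assumed path; in this repo the table is benchmark/normalizer.json,
# opened relative to the script in wer_benchmark.py.
with open("normalizer.json") as f:
    spelling_map = json.load(f)

def americanize(text: str) -> str:
    # Replace each whitespace-separated word if it appears in the table.
    return " ".join(spelling_map.get(word, word) for word in text.lower().split())

print(americanize("my favourite theatre will reorganise"))
# -> "my favorite theater will reorganize"
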
whisper_pipeline/faster-whisper-main/benchmark/requirements.benchmark.txt ADDED
@@ -0,0 +1,6 @@
1
+ transformers
2
+ jiwer
3
+ evaluate
4
+ datasets
5
+ memory_profiler
6
+ py3nvml
whisper_pipeline/faster-whisper-main/benchmark/speed_benchmark.py ADDED
@@ -0,0 +1,31 @@
1
+ import argparse
2
+ import timeit
3
+
4
+ from typing import Callable
5
+
6
+ from utils import inference
7
+
8
+ parser = argparse.ArgumentParser(description="Speed benchmark")
9
+ parser.add_argument(
10
+ "--repeat",
11
+ type=int,
12
+ default=3,
13
+ help="Times an experiment will be run.",
14
+ )
15
+ args = parser.parse_args()
16
+
17
+
18
+ def measure_speed(func: Callable[[], None]):
19
+ # as written in https://docs.python.org/3/library/timeit.html#timeit.Timer.repeat,
20
+ # min should be taken rather than the average
21
+ runtimes = timeit.repeat(
22
+ func,
23
+ repeat=args.repeat,
24
+ number=10,
25
+ )
26
+ print(runtimes)
27
+ print("Min execution time: %.3fs" % (min(runtimes) / 10.0))
28
+
29
+
30
+ if __name__ == "__main__":
31
+ measure_speed(inference)
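
The timing pattern above (timeit.repeat with number=10, reporting min/10 as the per-call latency) works on any callable; a self-contained sketch with a dummy workload standing in for the transcription call:

import timeit

def dummy_workload():
    # Stand-in for inference(); any deterministic function illustrates the pattern.
    sum(i * i for i in range(100_000))

runtimes = timeit.repeat(dummy_workload, repeat=3, number=10)
print(runtimes)
print("Min execution time: %.3fs" % (min(runtimes) / 10.0))
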
whisper_pipeline/faster-whisper-main/benchmark/utils.py ADDED
@@ -0,0 +1,39 @@
1
+ import logging
2
+
3
+ from threading import Thread
4
+ from typing import Optional
5
+
6
+ from faster_whisper import WhisperModel
7
+
8
+ model_path = "large-v3"
9
+ model = WhisperModel(model_path, device="cuda")
10
+
11
+
12
+ def inference():
13
+ segments, info = model.transcribe("benchmark.m4a", language="fr")
14
+ for segment in segments:
15
+ print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
16
+
17
+
18
+ def get_logger(name: Optional[str] = None) -> logging.Logger:
19
+ formatter = logging.Formatter("%(levelname)s: %(message)s")
20
+ logger = logging.getLogger(name)
21
+ logger.setLevel(logging.DEBUG)
22
+ handler = logging.StreamHandler()
23
+ handler.setFormatter(formatter)
24
+ logger.addHandler(handler)
25
+ return logger
26
+
27
+
28
+ class MyThread(Thread):
29
+ def __init__(self, func, params):
30
+ super(MyThread, self).__init__()
31
+ self.func = func
32
+ self.params = params
33
+ self.result = None
34
+
35
+ def run(self):
36
+ self.result = self.func(*self.params)
37
+
38
+ def get_result(self):
39
+ return self.result
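
MyThread is a thin Thread wrapper that keeps the target function's return value; a minimal usage sketch (the square worker is made up for illustration):

from utils import MyThread

def square(x):
    return x * x

threads = [MyThread(square, (n,)) for n in (2, 3, 4)]
for t in threads:
    t.start()
for t in threads:
    t.join()
print([t.get_result() for t in threads])  # [4, 9, 16]
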
whisper_pipeline/faster-whisper-main/benchmark/wer_benchmark.py ADDED
@@ -0,0 +1,64 @@
1
+ import argparse
2
+ import json
3
+ import os
4
+
5
+ from datasets import load_dataset
6
+ from evaluate import load
7
+ from tqdm import tqdm
8
+ from transformers.models.whisper.english_normalizer import EnglishTextNormalizer
9
+
10
+ from faster_whisper import WhisperModel
11
+
12
+ parser = argparse.ArgumentParser(description="WER benchmark")
13
+ parser.add_argument(
14
+ "--audio_numb",
15
+ type=int,
16
+ default=None,
17
+ help="Specify the number of validation audio files in the dataset."
18
+ " Set to None to retrieve all audio files.",
19
+ )
20
+ args = parser.parse_args()
21
+
22
+ model_path = "large-v3"
23
+ model = WhisperModel(model_path, device="cuda")
24
+
25
+ # load the dataset with streaming mode
26
+ dataset = load_dataset("librispeech_asr", "clean", split="validation", streaming=True)
27
+
28
+ # define the evaluation metric
29
+ wer_metric = load("wer")
30
+
31
+ with open(os.path.join(os.path.dirname(__file__), "normalizer.json"), "r") as f:
32
+ normalizer = EnglishTextNormalizer(json.load(f))
33
+
34
+
35
+ def inference(batch):
36
+ batch["transcription"] = []
37
+ for sample in batch["audio"]:
38
+ segments, info = model.transcribe(sample["array"], language="en")
39
+ batch["transcription"].append("".join([segment.text for segment in segments]))
40
+ batch["reference"] = batch["text"]
41
+ return batch
42
+
43
+
44
+ dataset = dataset.map(function=inference, batched=True, batch_size=16)
45
+
46
+ all_transcriptions = []
47
+ all_references = []
48
+
49
+ # iterate over the dataset and run inference
50
+ for i, result in tqdm(enumerate(dataset), desc="Evaluating..."):
51
+ all_transcriptions.append(result["transcription"])
52
+ all_references.append(result["reference"])
53
+ if args.audio_numb and i == (args.audio_numb - 1):
54
+ break
55
+
56
+ # normalize predictions and references
57
+ all_transcriptions = [normalizer(transcription) for transcription in all_transcriptions]
58
+ all_references = [normalizer(reference) for reference in all_references]
59
+
60
+ # compute the WER metric
61
+ wer = 100 * wer_metric.compute(
62
+ predictions=all_transcriptions, references=all_references
63
+ )
64
+ print("WER: %.3f" % wer)
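
The final metric call is the standard `evaluate` WER computation; on toy strings it behaves as in this sketch (the 50% figure follows from the definition of WER — one substitution out of two reference words — not from running the benchmark):

from evaluate import load

wer_metric = load("wer")  # requires jiwer, as listed in requirements.benchmark.txt
score = wer_metric.compute(
    predictions=["hello word"],
    references=["hello world"],
)
print("WER: %.3f" % (100 * score))  # WER: 50.000
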
whisper_pipeline/faster-whisper-main/build/lib/faster_whisper/__init__.py ADDED
@@ -0,0 +1,14 @@
1
+ from faster_whisper.audio import decode_audio
2
+ from faster_whisper.transcribe import BatchedInferencePipeline, WhisperModel
3
+ from faster_whisper.utils import available_models, download_model, format_timestamp
4
+ from faster_whisper.version import __version__
5
+
6
+ __all__ = [
7
+ "available_models",
8
+ "decode_audio",
9
+ "WhisperModel",
10
+ "BatchedInferencePipeline",
11
+ "download_model",
12
+ "format_timestamp",
13
+ "__version__",
14
+ ]
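
These exports form the package's public surface; a minimal transcription sketch using them, mirroring the benchmark scripts above (model size, device, and audio path are placeholders):

from faster_whisper import WhisperModel

model = WhisperModel("large-v3", device="cuda")  # or device="cpu"
segments, info = model.transcribe("example.wav", language="en")
for segment in segments:
    print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
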
whisper_pipeline/faster-whisper-main/build/lib/faster_whisper/assets/__init__.py ADDED
File without changes
whisper_pipeline/faster-whisper-main/build/lib/faster_whisper/assets/pyannote_vad_model.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0b5b3216d60a2d32fc086b47ea8c67589aaeb26b7e07fcbe620d6d0b83e209ea
3
+ size 17719103
whisper_pipeline/faster-whisper-main/build/lib/faster_whisper/assets/silero_vad.onnx ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6b99cbfd39246b6706f98ec13c7c50c6b299181f2474fa05cbc8046acc274396
3
+ size 2313101
whisper_pipeline/faster-whisper-main/build/lib/faster_whisper/audio.py ADDED
@@ -0,0 +1,58 @@
1
+ from typing import BinaryIO, Union
2
+
3
+ import torch
4
+ import torchaudio
5
+
6
+
7
+ def decode_audio(
8
+ input_file: Union[str, BinaryIO],
9
+ sampling_rate: int = 16000,
10
+ split_stereo: bool = False,
11
+ ):
12
+ """Decodes the audio.
13
+
14
+ Args:
15
+ input_file: Path to the input file or a file-like object.
16
+ sampling_rate: Resample the audio to this sample rate.
17
+ split_stereo: Return separate left and right channels.
18
+
19
+ Returns:
20
+ A float32 Torch Tensor.
21
+
22
+ If `split_stereo` is enabled, the function returns a 2-tuple with the
23
+ separated left and right channels.
24
+ """
25
+
26
+ waveform, audio_sf = torchaudio.load(input_file) # waveform: channels X T
27
+
28
+ if audio_sf != sampling_rate:
29
+ waveform = torchaudio.functional.resample(
30
+ waveform, orig_freq=audio_sf, new_freq=sampling_rate
31
+ )
32
+ if split_stereo:
33
+ return waveform[0], waveform[1]
34
+
35
+ return waveform.mean(0)
36
+
37
+
38
+ def pad_or_trim(array, length: int, *, axis: int = -1):
39
+ """
40
+ Pad or trim the audio array to the given length (in samples), as expected by the encoder.
41
+ """
42
+ axis = axis % array.ndim
43
+ if array.shape[axis] > length:
44
+ idx = [Ellipsis] * axis + [slice(length)] + [Ellipsis] * (array.ndim - axis - 1)
45
+ return array[idx]
46
+
47
+ if array.shape[axis] < length:
48
+ pad_widths = (
49
+ [
50
+ 0,
51
+ ]
52
+ * array.ndim
53
+ * 2
54
+ )
55
+ pad_widths[2 * axis] = length - array.shape[axis]
56
+ array = torch.nn.functional.pad(array, tuple(pad_widths[::-1]))
57
+
58
+ return array
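
decode_audio returns a mono float32 tensor at the requested rate (or a left/right pair with split_stereo=True), and pad_or_trim fixes its length; a short sketch, with the file name assumed:

from faster_whisper.audio import decode_audio, pad_or_trim

waveform = decode_audio("example.wav", sampling_rate=16000)  # shape: (num_samples,)
# Force exactly 30 seconds of audio, the window the Whisper encoder expects.
waveform_30s = pad_or_trim(waveform, length=30 * 16000)
print(waveform_30s.shape)  # torch.Size([480000])
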
whisper_pipeline/faster-whisper-main/build/lib/faster_whisper/feature_extractor.py ADDED
@@ -0,0 +1,114 @@
1
+ import torch
2
+
3
+
4
+ # Adapted from https://github.com/huggingface/transformers/blob/main/src/transformers/models/whisper/feature_extraction_whisper.py # noqa: E501
5
+ class FeatureExtractor:
6
+ def __init__(
7
+ self,
8
+ device: str = "auto",
9
+ feature_size=80,
10
+ sampling_rate=16000,
11
+ hop_length=160,
12
+ chunk_length=30,
13
+ n_fft=400,
14
+ ):
15
+ if device == "auto":
16
+ self.device = "cuda" if torch.cuda.is_available() else "cpu"
17
+ else:
18
+ self.device = device
19
+ self.n_fft = n_fft
20
+ self.hop_length = hop_length
21
+ self.chunk_length = chunk_length
22
+ self.n_samples = chunk_length * sampling_rate
23
+ self.nb_max_frames = self.n_samples // hop_length
24
+ self.time_per_frame = hop_length / sampling_rate
25
+ self.sampling_rate = sampling_rate
26
+ self.mel_filters = self.get_mel_filters(
27
+ sampling_rate, n_fft, n_mels=feature_size
28
+ )
29
+
30
+ @staticmethod
31
+ def get_mel_filters(sr, n_fft, n_mels=128):
32
+ """
33
+ Implementation of librosa.filters.mel in Pytorch
34
+ """
35
+ # Initialize the weights
36
+ n_mels = int(n_mels)
37
+
38
+ # Center freqs of each FFT bin
39
+ fftfreqs = torch.fft.rfftfreq(n=n_fft, d=1.0 / sr)
40
+
41
+ # 'Center freqs' of mel bands - uniformly spaced between limits
42
+ min_mel = 0.0
43
+ max_mel = 45.245640471924965
44
+
45
+ mels = torch.linspace(min_mel, max_mel, n_mels + 2)
46
+
47
+ # Fill in the linear scale
48
+ f_min = 0.0
49
+ f_sp = 200.0 / 3
50
+ freqs = f_min + f_sp * mels
51
+
52
+ # And now the nonlinear scale
53
+ min_log_hz = 1000.0 # beginning of log region (Hz)
54
+ min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels)
55
+ logstep = torch.log(torch.tensor(6.4)) / 27.0 # step size for log region
56
+
57
+ # If we have vector data, vectorize
58
+ log_t = mels >= min_log_mel
59
+ freqs[log_t] = min_log_hz * torch.exp(logstep * (mels[log_t] - min_log_mel))
60
+
61
+ mel_f = freqs
62
+
63
+ fdiff = torch.diff(mel_f)
64
+ ramps = mel_f.view(-1, 1) - fftfreqs.view(1, -1)
65
+
66
+ lower = -ramps[:-2] / fdiff[:-1].unsqueeze(1)
67
+ upper = ramps[2:] / fdiff[1:].unsqueeze(1)
68
+
69
+ # Intersect them with each other and zero, vectorized across all i
70
+ weights = torch.maximum(torch.zeros_like(lower), torch.minimum(lower, upper))
71
+
72
+ # Slaney-style mel is scaled to be approx constant energy per channel
73
+ enorm = 2.0 / (mel_f[2 : n_mels + 2] - mel_f[:n_mels])
74
+ weights *= enorm.unsqueeze(1)
75
+
76
+ return weights
77
+
78
+ def __call__(self, waveform, padding=True, chunk_length=None, to_cpu=False):
79
+ """
80
+ Compute the log-Mel spectrogram of the provided audio.
81
+ """
82
+
83
+ if chunk_length is not None:
84
+ self.n_samples = chunk_length * self.sampling_rate
85
+ self.nb_max_frames = self.n_samples // self.hop_length
86
+
87
+ if waveform.dtype is not torch.float32:
88
+ waveform = waveform.to(torch.float32)
89
+
90
+ waveform = (
91
+ waveform.to(self.device)
92
+ if self.device == "cuda" and not waveform.is_cuda
93
+ else waveform
94
+ )
95
+
96
+ if padding:
97
+ waveform = torch.nn.functional.pad(waveform, (0, self.n_samples))
98
+
99
+ window = torch.hann_window(self.n_fft).to(waveform.device)
100
+
101
+ stft = torch.stft(
102
+ waveform, self.n_fft, self.hop_length, window=window, return_complex=True
103
+ )
104
+ magnitudes = stft[..., :-1].abs() ** 2
105
+
106
+ mel_spec = self.mel_filters.to(waveform.device) @ magnitudes
107
+
108
+ log_spec = torch.clamp(mel_spec, min=1e-10).log10()
109
+ log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
110
+ log_spec = (log_spec + 4.0) / 4.0
111
+
112
+ # When the model is running on multiple GPUs, the output should be moved
113
+ # to the CPU since we don't know which GPU will handle the next job.
114
+ return log_spec.cpu() if to_cpu else log_spec
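
Applied to a decoded waveform, the extractor produces the 80-bin log-Mel spectrogram the model consumes; a small sketch (audio loading as in audio.py above, file name assumed):

from faster_whisper.audio import decode_audio
from faster_whisper.feature_extractor import FeatureExtractor

feature_extractor = FeatureExtractor(device="cpu", feature_size=80)
waveform = decode_audio("example.wav", sampling_rate=16000)
features = feature_extractor(waveform, padding=True)
# One row per Mel bin, one column per 10 ms hop (hop_length=160 at 16 kHz).
print(features.shape)  # e.g. torch.Size([80, num_frames])
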
whisper_pipeline/faster-whisper-main/build/lib/faster_whisper/tokenizer.py ADDED
@@ -0,0 +1,314 @@
1
+ import string
2
+
3
+ from functools import cached_property
4
+ from typing import List, Optional, Tuple
5
+
6
+ import tokenizers
7
+
8
+
9
+ class Tokenizer:
10
+ """Simple wrapper around a tokenizers.Tokenizer."""
11
+
12
+ def __init__(
13
+ self,
14
+ tokenizer: tokenizers.Tokenizer,
15
+ multilingual: bool,
16
+ task: Optional[str] = None,
17
+ language: Optional[str] = None,
18
+ ):
19
+ self.tokenizer = tokenizer
20
+
21
+ if multilingual:
22
+ if task not in _TASKS:
23
+ raise ValueError(
24
+ "'%s' is not a valid task (accepted tasks: %s)"
25
+ % (task, ", ".join(_TASKS))
26
+ )
27
+
28
+ if language not in _LANGUAGE_CODES:
29
+ raise ValueError(
30
+ "'%s' is not a valid language code (accepted language codes: %s)"
31
+ % (language, ", ".join(_LANGUAGE_CODES))
32
+ )
33
+
34
+ self.task = self.tokenizer.token_to_id("<|%s|>" % task)
35
+ self.language = self.tokenizer.token_to_id("<|%s|>" % language)
36
+ self.language_code = language
37
+ else:
38
+ self.task = None
39
+ self.language = None
40
+ self.language_code = "en"
41
+
42
+ @cached_property
43
+ def transcribe(self) -> int:
44
+ return self.tokenizer.token_to_id("<|transcribe|>")
45
+
46
+ @cached_property
47
+ def translate(self) -> int:
48
+ return self.tokenizer.token_to_id("<|translate|>")
49
+
50
+ @cached_property
51
+ def sot(self) -> int:
52
+ return self.tokenizer.token_to_id("<|startoftranscript|>")
53
+
54
+ @cached_property
55
+ def sot_lm(self) -> int:
56
+ return self.tokenizer.token_to_id("<|startoflm|>")
57
+
58
+ @cached_property
59
+ def sot_prev(self) -> int:
60
+ return self.tokenizer.token_to_id("<|startofprev|>")
61
+
62
+ @cached_property
63
+ def eot(self) -> int:
64
+ return self.tokenizer.token_to_id("<|endoftext|>")
65
+
66
+ @cached_property
67
+ def no_timestamps(self) -> int:
68
+ return self.tokenizer.token_to_id("<|notimestamps|>")
69
+
70
+ @property
71
+ def timestamp_begin(self) -> int:
72
+ return self.no_timestamps + 1
73
+
74
+ @property
75
+ def sot_sequence(self) -> List[int]:
76
+ sequence = [self.sot]
77
+
78
+ if self.language is not None:
79
+ sequence.append(self.language)
80
+
81
+ if self.task is not None:
82
+ sequence.append(self.task)
83
+
84
+ return sequence
85
+
86
+ def encode(self, text: str) -> List[int]:
87
+ return self.tokenizer.encode(text, add_special_tokens=False).ids
88
+
89
+ def decode(self, tokens: List[int]) -> str:
90
+ text_tokens = [token for token in tokens if token < self.eot]
91
+ return self.tokenizer.decode(text_tokens)
92
+
93
+ def decode_with_timestamps(self, tokens: List[int]) -> str:
94
+ outputs = [[]]
95
+
96
+ for token in tokens:
97
+ if token >= self.timestamp_begin:
98
+ timestamp = f"<|{(token - self.timestamp_begin) * 0.02:.2f}|>"
99
+ outputs.append(timestamp)
100
+ outputs.append([])
101
+ else:
102
+ outputs[-1].append(token)
103
+
104
+ return "".join(
105
+ [s if isinstance(s, str) else self.tokenizer.decode(s) for s in outputs]
106
+ )
107
+
108
+ @cached_property
109
+ def non_speech_tokens(self) -> Tuple[int, ...]:
110
+ """
111
+ Returns the list of tokens to suppress in order to avoid any speaker tags or non-speech
112
+ annotations, to prevent sampling texts that are not actually spoken in the audio, e.g.
113
+
114
+ - ♪♪♪
115
+ - ( SPEAKING FOREIGN LANGUAGE )
116
+ - [DAVID] Hey there,
117
+
118
+ keeping basic punctuation like commas, periods, question marks, exclamation points, etc.
119
+ """
120
+ symbols = list('"#()*+/:;<=>@[\\]^_`{|}~「」『』')
121
+ symbols += (
122
+ "<< >> <<< >>> -- --- -( -[ (' (\" (( )) ((( ))) [[ ]] {{ }} ♪♪ ♪♪♪".split()
123
+ )
124
+
125
+ # symbols that may be a single token or multiple tokens depending on the tokenizer.
126
+ # In case they're multiple tokens, suppress the first token, which is safe because:
127
+ # These are between U+2640 and U+267F miscellaneous symbols that are okay to suppress
128
+ # in generations, and in the 3-byte UTF-8 representation they share the first two bytes.
129
+ miscellaneous = set("♩♪♫♬♭♮♯")
130
+ assert all(0x2640 <= ord(c) <= 0x267F for c in miscellaneous)
131
+
132
+ # allow hyphens "-" and single quotes "'" between words, but not at the beginning of a word
133
+ result = {self.encode(" -")[0], self.encode(" '")[0]}
134
+ for symbol in symbols + list(miscellaneous):
135
+ for tokens in [
136
+ self.encode(symbol),
137
+ self.encode(" " + symbol),
138
+ ]:
139
+ if len(tokens) == 1 or symbol in miscellaneous:
140
+ result.add(tokens[0])
141
+
142
+ return tuple(sorted(result))
143
+
144
+ def split_to_word_tokens(
145
+ self, tokens: List[int]
146
+ ) -> Tuple[List[str], List[List[int]]]:
147
+ if self.language_code in {"zh", "ja", "th", "lo", "my", "yue"}:
148
+ # These languages don't typically use spaces, so it is difficult to split words
149
+ # without morpheme analysis. Here, we instead split words at any
150
+ # position where the tokens are decoded as valid Unicode code points
151
+ return self.split_tokens_on_unicode(tokens)
152
+
153
+ return self.split_tokens_on_spaces(tokens)
154
+
155
+ def split_tokens_on_unicode(
156
+ self, tokens: List[int]
157
+ ) -> Tuple[List[str], List[List[int]]]:
158
+ decoded_full = self.decode_with_timestamps(tokens)
159
+ replacement_char = "\ufffd"
160
+
161
+ words = []
162
+ word_tokens = []
163
+ current_tokens = []
164
+ unicode_offset = 0
165
+
166
+ for token in tokens:
167
+ current_tokens.append(token)
168
+ decoded = self.decode_with_timestamps(current_tokens)
169
+
170
+ try:
171
+ replacement_char_index = decoded.index(replacement_char)
172
+ replacement_char_index += unicode_offset
173
+ except ValueError:
174
+ replacement_char_index = None
175
+
176
+ if replacement_char_index is None or (
177
+ replacement_char_index < len(decoded_full)
178
+ and decoded_full[replacement_char_index] == replacement_char
179
+ ):
180
+ words.append(decoded)
181
+ word_tokens.append(current_tokens)
182
+ current_tokens = []
183
+ unicode_offset += len(decoded)
184
+
185
+ return words, word_tokens
186
+
187
+ def split_tokens_on_spaces(
188
+ self, tokens: List[int]
189
+ ) -> Tuple[List[str], List[List[int]]]:
190
+ subwords, subword_tokens_list = self.split_tokens_on_unicode(tokens)
191
+ words = []
192
+ word_tokens = []
193
+
194
+ for subword, subword_tokens in zip(subwords, subword_tokens_list):
195
+ special = subword_tokens[0] >= self.eot
196
+ with_space = subword.startswith(" ")
197
+ punctuation = subword.strip() in string.punctuation
198
+ if special or with_space or punctuation or len(words) == 0:
199
+ words.append(subword)
200
+ word_tokens.append(subword_tokens)
201
+ else:
202
+ words[-1] = words[-1] + subword
203
+ word_tokens[-1].extend(subword_tokens)
204
+
205
+ return words, word_tokens
206
+
207
+
208
+ _TASKS = (
209
+ "transcribe",
210
+ "translate",
211
+ )
212
+
213
+ _LANGUAGE_CODES = (
214
+ "af",
215
+ "am",
216
+ "ar",
217
+ "as",
218
+ "az",
219
+ "ba",
220
+ "be",
221
+ "bg",
222
+ "bn",
223
+ "bo",
224
+ "br",
225
+ "bs",
226
+ "ca",
227
+ "cs",
228
+ "cy",
229
+ "da",
230
+ "de",
231
+ "el",
232
+ "en",
233
+ "es",
234
+ "et",
235
+ "eu",
236
+ "fa",
237
+ "fi",
238
+ "fo",
239
+ "fr",
240
+ "gl",
241
+ "gu",
242
+ "ha",
243
+ "haw",
244
+ "he",
245
+ "hi",
246
+ "hr",
247
+ "ht",
248
+ "hu",
249
+ "hy",
250
+ "id",
251
+ "is",
252
+ "it",
253
+ "ja",
254
+ "jw",
255
+ "ka",
256
+ "kk",
257
+ "km",
258
+ "kn",
259
+ "ko",
260
+ "la",
261
+ "lb",
262
+ "ln",
263
+ "lo",
264
+ "lt",
265
+ "lv",
266
+ "mg",
267
+ "mi",
268
+ "mk",
269
+ "ml",
270
+ "mn",
271
+ "mr",
272
+ "ms",
273
+ "mt",
274
+ "my",
275
+ "ne",
276
+ "nl",
277
+ "nn",
278
+ "no",
279
+ "oc",
280
+ "pa",
281
+ "pl",
282
+ "ps",
283
+ "pt",
284
+ "ro",
285
+ "ru",
286
+ "sa",
287
+ "sd",
288
+ "si",
289
+ "sk",
290
+ "sl",
291
+ "sn",
292
+ "so",
293
+ "sq",
294
+ "sr",
295
+ "su",
296
+ "sv",
297
+ "sw",
298
+ "ta",
299
+ "te",
300
+ "tg",
301
+ "th",
302
+ "tk",
303
+ "tl",
304
+ "tr",
305
+ "tt",
306
+ "uk",
307
+ "ur",
308
+ "uz",
309
+ "vi",
310
+ "yi",
311
+ "yo",
312
+ "zh",
313
+ "yue",
314
+ )
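A hedged usage sketch of the Tokenizer wrapper defined above. It assumes the multilingual Whisper tokenizer can be fetched from the Hugging Face Hub (the same `openai/whisper-tiny` fallback used later in transcribe.py) and that the package is importable as `faster_whisper`.

```python
import tokenizers

from faster_whisper.tokenizer import Tokenizer

# Load the multilingual tokenizer.json from the Hub (requires network access).
hf_tokenizer = tokenizers.Tokenizer.from_pretrained("openai/whisper-tiny")
tokenizer = Tokenizer(hf_tokenizer, multilingual=True, task="transcribe", language="vi")

print(tokenizer.sot_sequence)        # ids of <|startoftranscript|>, <|vi|>, <|transcribe|>
ids = tokenizer.encode(" xin chào")  # plain text -> token ids (no special tokens)
print(tokenizer.decode(ids))

# Timestamp tokens sit above <|notimestamps|> and map to seconds in 0.02 s steps.
start_ts = tokenizer.timestamp_begin + 150   # 150 * 0.02 s = 3.00 s
end_ts = tokenizer.timestamp_begin + 250     # 250 * 0.02 s = 5.00 s
print(tokenizer.decode_with_timestamps([start_ts, *ids, end_ts]))
```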
whisper_pipeline/faster-whisper-main/build/lib/faster_whisper/transcribe.py ADDED
@@ -0,0 +1,2170 @@
1
+ import itertools
2
+ import json
3
+ import logging
4
+ import os
5
+ import random
6
+ import zlib
7
+
8
+ from collections import Counter, defaultdict
9
+ from inspect import signature
10
+ from typing import BinaryIO, Iterable, List, NamedTuple, Optional, Tuple, Union
11
+
12
+ import ctranslate2
13
+ import numpy as np
14
+ import tokenizers
15
+ import torch
16
+
17
+ from pyannote.audio import Model
18
+ from tqdm import tqdm
19
+
20
+ from faster_whisper.audio import decode_audio, pad_or_trim
21
+ from faster_whisper.feature_extractor import FeatureExtractor
22
+ from faster_whisper.tokenizer import _LANGUAGE_CODES, Tokenizer
23
+ from faster_whisper.utils import (
24
+ download_model,
25
+ format_timestamp,
26
+ get_assets_path,
27
+ get_end,
28
+ get_logger,
29
+ )
30
+ from faster_whisper.vad import (
31
+ SpeechTimestampsMap,
32
+ VadOptions,
33
+ VoiceActivitySegmentation,
34
+ collect_chunks,
35
+ get_speech_timestamps,
36
+ merge_chunks,
37
+ )
38
+
39
+
40
+ class Word(NamedTuple):
41
+ start: float
42
+ end: float
43
+ word: str
44
+ probability: float
45
+
46
+
47
+ class Segment(NamedTuple):
48
+ id: int
49
+ seek: int
50
+ start: float
51
+ end: float
52
+ text: str
53
+ tokens: List[int]
54
+ avg_logprob: float
55
+ compression_ratio: float
56
+ no_speech_prob: float
57
+ words: Optional[List[Word]]
58
+ temperature: Optional[float] = 1.0
59
+
60
+
61
+ # Added additional parameters for multilingual videos and fixes below
62
+ class TranscriptionOptions(NamedTuple):
63
+ beam_size: int
64
+ best_of: int
65
+ patience: float
66
+ length_penalty: float
67
+ repetition_penalty: float
68
+ no_repeat_ngram_size: int
69
+ log_prob_threshold: Optional[float]
70
+ log_prob_low_threshold: Optional[float]
71
+ no_speech_threshold: Optional[float]
72
+ compression_ratio_threshold: Optional[float]
73
+ condition_on_previous_text: bool
74
+ prompt_reset_on_temperature: float
75
+ temperatures: List[float]
76
+ initial_prompt: Optional[Union[str, Iterable[int]]]
77
+ prefix: Optional[str]
78
+ suppress_blank: bool
79
+ suppress_tokens: Optional[List[int]]
80
+ without_timestamps: bool
81
+ max_initial_timestamp: float
82
+ word_timestamps: bool
83
+ prepend_punctuations: str
84
+ append_punctuations: str
85
+ multilingual: bool
86
+ output_language: Optional[str]
87
+ max_new_tokens: Optional[int]
88
+ clip_timestamps: Union[str, List[float]]
89
+ hallucination_silence_threshold: Optional[float]
90
+ hotwords: Optional[str]
91
+
92
+
93
+ class TranscriptionInfo(NamedTuple):
94
+ language: str
95
+ language_probability: float
96
+ duration: float
97
+ duration_after_vad: float
98
+ all_language_probs: Optional[List[Tuple[str, float]]]
99
+ transcription_options: TranscriptionOptions
100
+ vad_options: VadOptions
101
+
102
+
103
+ # The code below originally comes from the HF pipeline, is used in whisper-x
104
+ # (https://github.com/m-bain/whisperX), and was adapted for faster_whisper
105
+
106
+
107
+ class BatchedInferencePipeline:
108
+ """
109
+ Huggingface Pipeline wrapper for WhisperModel.
110
+ Copyright (c) 2022, Max Bain
111
+ All rights reserved.
112
+ Modified by Mobius Labs GmbH
113
+ """
114
+
115
+ def __init__(
116
+ self,
117
+ model,
118
+ use_vad_model: bool = True,
119
+ options: Optional[NamedTuple] = None,
120
+ tokenizer=None,
121
+ chunk_length: int = 30,
122
+ vad_device: Union[int, str, "torch.device"] = "auto",
123
+ vad_onset: float = 0.500,
124
+ vad_offset: float = 0.363,
125
+ language: Optional[str] = None,
126
+ ):
127
+ self.model: WhisperModel = model
128
+ self.tokenizer = tokenizer
129
+ self.options = options
130
+ self.preset_language = language
131
+ self.use_vad_model = use_vad_model
132
+ self.vad_onset = vad_onset
133
+ self.vad_offset = vad_offset
134
+ self.vad_model_path = os.path.join(get_assets_path(), "pyannote_vad_model.bin")
135
+ if self.use_vad_model:
136
+ self.vad_device = self.get_device(vad_device)
137
+ self.vad_model = self.load_vad_model(
138
+ vad_onset=self.vad_onset, vad_offset=self.vad_offset
139
+ )
140
+ else:
141
+ self.vad_model = None
142
+ self.chunk_length = chunk_length # VAD merging size
143
+ self.last_speech_timestamp = 0.0
144
+
145
+ def get_device(self, device: Union[int, str, "torch.device"]):
146
+ """
147
+ Converts the input device into a torch.device object.
148
+
149
+ The input can be an integer, a string, or a `torch.device` object.
150
+
151
+ The function handles a special case where the input device is "auto".
152
+ When "auto" is specified, the device will default to the
153
+ device of the model (self.model.device). If the model's device is also "auto",
154
+ it selects "cuda" if a CUDA-capable device is available; otherwise, it selects "cpu".
155
+ """
156
+ if isinstance(device, torch.device):
157
+ return device
158
+ elif isinstance(device, str):
159
+ if device == "auto" and self.model.device == "auto":
160
+ device = "cuda" if torch.cuda.is_available() else "cpu"
161
+ elif device == "auto":
162
+ device = self.model.device
163
+ return torch.device(device)
164
+ elif device < 0:
165
+ return torch.device("cpu")
166
+ else:
167
+ return torch.device(f"cuda:{device}")
168
+
169
+ def forward(self, features, segments_metadata, **forward_params):
170
+ encoder_output, outputs = self.model.generate_segment_batched(
171
+ features, self.tokenizer, forward_params
172
+ )
173
+
174
+ segmented_outputs = []
175
+ segment_sizes = []
176
+ for segment_metadata, output in zip(segments_metadata, outputs):
177
+ duration = segment_metadata["end_time"] - segment_metadata["start_time"]
178
+ segment_size = int(duration * self.model.frames_per_second)
179
+ segment_sizes.append(segment_size)
180
+ (
181
+ subsegments,
182
+ seek,
183
+ single_timestamp_ending,
184
+ ) = self.model._split_segments_by_timestamps(
185
+ tokenizer=self.tokenizer,
186
+ tokens=output["tokens"],
187
+ time_offset=segment_metadata["start_time"],
188
+ segment_size=segment_size,
189
+ segment_duration=duration,
190
+ seek=0,
191
+ )
192
+ segmented_outputs.append(
193
+ [
194
+ dict(
195
+ text=self.tokenizer.decode(subsegment["tokens"]),
196
+ avg_logprob=output["avg_logprob"],
197
+ no_speech_prob=output["no_speech_prob"],
198
+ tokens=subsegment["tokens"],
199
+ start=subsegment["start"],
200
+ end=subsegment["end"],
201
+ compression_ratio=get_compression_ratio(
202
+ self.tokenizer.decode(subsegment["tokens"])
203
+ ),
204
+ )
205
+ for subsegment in subsegments
206
+ ]
207
+ )
208
+ if forward_params["word_timestamps"]:
209
+ self.last_speech_timestamp = self.model.add_word_timestamps(
210
+ segmented_outputs,
211
+ self.tokenizer,
212
+ encoder_output,
213
+ segment_sizes,
214
+ forward_params["prepend_punctuations"],
215
+ forward_params["append_punctuations"],
216
+ self.last_speech_timestamp,
217
+ )
218
+
219
+ return segmented_outputs
220
+
221
+ def get_language_and_tokenizer(
222
+ self, audio, task: Optional[str] = None, language: Optional[str] = None
223
+ ):
224
+ all_language_probs = None
225
+ language_probability = 1.0
226
+
227
+ if self.tokenizer is None:
228
+ if not language:
229
+ (
230
+ language,
231
+ language_probability,
232
+ all_language_probs,
233
+ ) = self.model.detect_language(audio)
234
+ task = task or "transcribe"
235
+ self.tokenizer = Tokenizer(
236
+ self.model.hf_tokenizer,
237
+ self.model.model.is_multilingual,
238
+ task=task,
239
+ language=language,
240
+ )
241
+ else:
242
+ if task is not None:
243
+ self.tokenizer.task = self.tokenizer.tokenizer.token_to_id(
244
+ f"<|{task}|>"
245
+ )
246
+
247
+ if language is not None:
248
+ self.tokenizer.language = self.tokenizer.tokenizer.token_to_id(
249
+ f"<|{language}|>"
250
+ )
251
+ self.tokenizer.language_code = language
252
+
253
+ return language, language_probability, task, all_language_probs
254
+
255
+ @staticmethod
256
+ def audio_split(audio, segments, sampling_rate):
257
+ """Returns splitted audio chunks as iterator"""
258
+ audio_segments = []
259
+ segments_metadata = []
260
+ for seg in segments:
261
+ f1 = int(seg["start"] * sampling_rate)
262
+ f2 = int(seg["end"] * sampling_rate)
263
+ seg_metadata = {
264
+ "start_time": seg["start"],
265
+ "end_time": seg["end"],
266
+ "stitched_seg": seg["segments"],
267
+ }
268
+ audio_segments.append(audio[f1:f2])
269
+ segments_metadata.append(seg_metadata)
270
+ return audio_segments, segments_metadata
271
+
272
+ def load_vad_model(self, vad_onset=0.500, vad_offset=0.363):
273
+ vad_model = Model.from_pretrained(self.vad_model_path)
274
+ hyperparameters = {
275
+ "onset": vad_onset,
276
+ "offset": vad_offset,
277
+ "min_duration_on": 0.1,
278
+ "min_duration_off": 0.1,
279
+ }
280
+
281
+ vad_pipeline = VoiceActivitySegmentation(
282
+ segmentation=vad_model, device=torch.device(self.vad_device)
283
+ )
284
+ vad_pipeline.instantiate(hyperparameters)
285
+ return vad_pipeline
286
+
287
+ def transcribe(
288
+ self,
289
+ audio: Union[str, torch.Tensor, np.ndarray],
290
+ vad_segments: Optional[List[dict]] = None,
291
+ batch_size: int = 16,
292
+ language: Optional[str] = None,
293
+ task: str = None,
294
+ log_progress: bool = False,
295
+ beam_size: int = 5,
296
+ best_of: int = 5,
297
+ patience: float = 1,
298
+ length_penalty: float = 1,
299
+ repetition_penalty: float = 1,
300
+ no_repeat_ngram_size: int = 0,
301
+ temperature: Union[float, List[float], Tuple[float, ...]] = [
302
+ 0.0,
303
+ 0.2,
304
+ 0.4,
305
+ 0.6,
306
+ 0.8,
307
+ 1.0,
308
+ ],
309
+ compression_ratio_threshold: Optional[float] = 2.4,
310
+ log_prob_threshold: Optional[float] = -1.0,
311
+ log_prob_low_threshold: Optional[float] = None,
312
+ no_speech_threshold: Optional[float] = 0.6,
313
+ initial_prompt: Optional[Union[str, Iterable[int]]] = None,
314
+ prefix: Optional[str] = None,
315
+ suppress_blank: bool = True,
316
+ suppress_tokens: Optional[List[int]] = [-1],
317
+ prepend_punctuations: str = "\"'“¿([{-",
318
+ append_punctuations: str = "\"'.。,,!!??::”)]}、",
319
+ max_new_tokens: Optional[int] = None,
320
+ hotwords: Optional[str] = None,
321
+ word_timestamps: bool = False,
322
+ without_timestamps: bool = True,
323
+ ) -> Tuple[Iterable[Segment], TranscriptionInfo]:
324
+ """transcribe audio in chunks in batched fashion and return with language info.
325
+
326
+ Arguments:
327
+ audio: audio file as numpy array/path for batched transcription.
328
+ vad_segments: Optionally provide a list of dictionaries, each containing "start", "end",
329
+ and "segments" keys.
330
+ The "start" and "end" keys specify the start and end of the voiced region within
331
+ a 30 sec boundary. An additional key "segments" contains all the start
332
+ and end times of voiced regions within that 30 sec boundary as a list of tuples.
333
+ If no vad_segments are specified, the internal VAD model is used to segment the audio automatically.
334
+ batch_size: the maximum number of parallel requests to the model for decoding.
335
+ language: The language spoken in the audio.
336
+ task: either "transcribe" or "translate".
337
+ log_progress: whether to show progress bar or not.
338
+ beam_size: Beam size to use for decoding.
339
+ best_of: Number of candidates when sampling with non-zero temperature.
340
+ patience: Beam search patience factor.
341
+ length_penalty: Exponential length penalty constant.
342
+ repetition_penalty: Penalty applied to the score of previously generated tokens
343
+ (set > 1 to penalize).
344
+ no_repeat_ngram_size: Prevent repetitions of ngrams with this size (set 0 to disable).
345
+ temperature: Temperature for sampling. It can be a tuple of temperatures,
346
+ which will be successively used upon failures according to either
347
+ `compression_ratio_threshold` or `log_prob_threshold`.
348
+ compression_ratio_threshold: If the gzip compression ratio is above this value,
349
+ treat as failed.
350
+ log_prob_threshold: If the average log probability over sampled tokens is
351
+ below this value, treat as failed.
352
+ log_prob_low_threshold: This parameter alone is sufficient to skip an output text,
353
+ whereas log_prob_threshold also looks for appropriate no_speech_threshold value.
354
+ This value should be less than log_prob_threshold.
355
+ no_speech_threshold: If the no_speech probability is higher than this value AND
356
+ the average log probability over sampled tokens is below `log_prob_threshold`,
357
+ consider the segment as silent.
358
+ initial_prompt: Optional text string or iterable of token ids to provide as a
359
+ prompt for the first window.
360
+ prefix: Optional text to provide as a prefix for the first window.
361
+ suppress_blank: Suppress blank outputs at the beginning of the sampling.
362
+ suppress_tokens: List of token IDs to suppress. -1 will suppress a default set
363
+ of symbols as defined in `tokenizer.non_speech_tokens()`.
364
+ prepend_punctuations: If word_timestamps is True, merge these punctuation symbols
365
+ with the next word
366
+ append_punctuations: If word_timestamps is True, merge these punctuation symbols
367
+ with the previous word
368
+ max_new_tokens: Maximum number of new tokens to generate per-chunk. If not set,
369
+ the maximum will be set by the default max_length.
370
+ hotwords:
371
+ Hotwords/hint phrases to the model. Has no effect if prefix is not None.
372
+ word_timestamps: Extract word-level timestamps using the cross-attention pattern
373
+ and dynamic time warping, and include the timestamps for each word in each segment.
374
+ Set as False.
375
+ without_timestamps: Only sample text tokens.
376
+
377
+ Static params: (Fixed for batched version)
378
+ max_initial_timestamp: The initial timestamp cannot be later than this, set at 0.0.
379
+ multilingual: If True, perform transcription on multilingual videos. Set as False.
380
+ output_language: Valid only if multilingual is set to True.
381
+ Specifies the string representing the output language. One of
382
+ 'en' (English) or 'hybrid' (code-switched transcription). set as None.
383
+ condition_on_previous_text: If True, the previous output of the model is provided
384
+ as a prompt for the next window; disabling may make the text inconsistent across
385
+ windows, but the model becomes less prone to getting stuck in a failure loop,
386
+ such as repetition looping or timestamps going out of sync. Set as False
387
+ prompt_reset_on_temperature: Resets prompt if temperature is above this value.
388
+ Arg has effect only if condition_on_previous_text is True. Set at 0.5
389
+ #TODO: support "hallucination_silence_threshold" when "word_timestamps=True"
390
+ hallucination_silence_threshold: Optional[float]
391
+ When word_timestamps is True, skip silent periods longer than this threshold
392
+ (in seconds) when a possible hallucination is detected. set as None.
393
+ clip_timestamps:
394
+ Comma-separated list start,end,start,end,... timestamps (in seconds) of clips to
395
+ process. The last end timestamp defaults to the end of the file. Set as "0".
396
+
397
+ unused:
398
+ language_detection_threshold: If the maximum probability of the language tokens is
399
+ higher than this value, the language is detected.
400
+ language_detection_segments: Number of segments to consider for the language detection.
401
+ vad_filter: Enable the voice activity detection (VAD) to filter out parts of the audio
402
+ without speech. This step is using the Silero VAD model
403
+ https://github.com/snakers4/silero-vad.
404
+ vad_parameters: Dictionary of Silero VAD parameters or VadOptions class (see available
405
+ parameters and default values in the class `VadOptions`).
406
+ chunk_length: The length of audio segments. If it is not None, it will overwrite the
407
+ default chunk_length of the FeatureExtractor.
408
+
409
+
410
+ Returns:
411
+ A tuple with:
412
+
413
+ - a generator over transcribed batched segments.
414
+ - an instance of TranscriptionInfo.
415
+ """
416
+
417
+ sampling_rate = self.model.feature_extractor.sampling_rate
418
+
419
+ if isinstance(audio, np.ndarray):
420
+ audio = torch.from_numpy(audio)
421
+ elif not isinstance(audio, torch.Tensor):
422
+ audio = decode_audio(audio, sampling_rate=sampling_rate)
423
+ duration = audio.shape[0] / sampling_rate
424
+
425
+ # if no segment split is provided, use vad_model and generate segments
426
+ if not vad_segments:
427
+ # if the audio is shorter than the chunk length, run it as a single segment even without vad_segments
428
+ if self.use_vad_model:
429
+ vad_segments = self.vad_model(
430
+ {
431
+ "waveform": audio.unsqueeze(0),
432
+ "sample_rate": 16000,
433
+ }
434
+ )
435
+ vad_segments = merge_chunks(
436
+ vad_segments,
437
+ self.chunk_length,
438
+ onset=self.vad_onset,
439
+ offset=self.vad_offset,
440
+ )
441
+ elif duration < self.chunk_length:
442
+ vad_segments = [
443
+ {"start": 0.0, "end": duration, "segments": [(0.0, duration)]}
444
+ ]
445
+ else:
446
+ raise RuntimeError(
447
+ "No vad segments found. Set 'use_vad_model' to True while loading the model"
448
+ )
449
+ if self.model.model.is_multilingual:
450
+ language = language or self.preset_language
451
+ elif language != "en":
452
+ if language is not None:
453
+ self.model.logger.warning(
454
+ f"English-only model is used, but {language} language is"
455
+ "chosen, setting language to 'en'."
456
+ )
457
+ language = "en"
458
+
459
+ (
460
+ language,
461
+ language_probability,
462
+ task,
463
+ all_language_probs,
464
+ ) = self.get_language_and_tokenizer(audio, task, language)
465
+
466
+ duration_after_vad = sum(
467
+ segment["end"] - segment["start"] for segment in vad_segments
468
+ )
469
+
470
+ # batched options: see the difference with default options in WhisperModel
471
+ batched_options = TranscriptionOptions(
472
+ beam_size=beam_size,
473
+ best_of=best_of,
474
+ patience=patience,
475
+ length_penalty=length_penalty,
476
+ repetition_penalty=repetition_penalty,
477
+ no_repeat_ngram_size=no_repeat_ngram_size,
478
+ log_prob_threshold=log_prob_threshold,
479
+ log_prob_low_threshold=log_prob_low_threshold,
480
+ no_speech_threshold=no_speech_threshold,
481
+ compression_ratio_threshold=compression_ratio_threshold,
482
+ temperatures=(
483
+ temperature if isinstance(temperature, (list, tuple)) else [temperature]
484
+ ),
485
+ initial_prompt=initial_prompt,
486
+ prefix=prefix,
487
+ suppress_blank=suppress_blank,
488
+ suppress_tokens=get_suppressed_tokens(self.tokenizer, suppress_tokens),
489
+ prepend_punctuations=prepend_punctuations,
490
+ append_punctuations=append_punctuations,
491
+ max_new_tokens=max_new_tokens,
492
+ hotwords=hotwords,
493
+ word_timestamps=word_timestamps,
494
+ hallucination_silence_threshold=None,
495
+ condition_on_previous_text=False,
496
+ clip_timestamps="0",
497
+ prompt_reset_on_temperature=0.5,
498
+ multilingual=False,
499
+ output_language=None,
500
+ without_timestamps=without_timestamps,
501
+ max_initial_timestamp=0.0,
502
+ )
503
+
504
+ info = TranscriptionInfo(
505
+ language=language,
506
+ language_probability=language_probability,
507
+ duration=duration,
508
+ duration_after_vad=duration_after_vad,
509
+ transcription_options=batched_options,
510
+ vad_options=None,
511
+ all_language_probs=all_language_probs,
512
+ )
513
+
514
+ audio_segments, segments_metadata = self.audio_split(
515
+ audio, vad_segments, sampling_rate
516
+ )
517
+ to_cpu = (
518
+ self.model.model.device == "cuda" and len(self.model.model.device_index) > 1
519
+ )
520
+ audio_segments = torch.nested.nested_tensor(audio_segments).to_padded_tensor(
521
+ padding=0
522
+ )
523
+ features = torch.stack(
524
+ [
525
+ self.model.feature_extractor(audio_segment, to_cpu=to_cpu)[
526
+ ..., : self.model.feature_extractor.nb_max_frames
527
+ ]
528
+ for audio_segment in audio_segments
529
+ ]
530
+ )
531
+
532
+ segments = self._batched_segments_generator(
533
+ features,
534
+ segments_metadata,
535
+ batch_size,
536
+ batched_options,
537
+ log_progress,
538
+ )
539
+
540
+ return segments, info
541
+
542
+ def _batched_segments_generator(
543
+ self, features, segments_metadata, batch_size, options, log_progress
544
+ ):
545
+ pbar = tqdm(total=len(features), disable=not log_progress, position=0)
546
+ seg_idx = 0
547
+ for i in range(0, len(features), batch_size):
548
+ results = self.forward(
549
+ features[i : i + batch_size],
550
+ segments_metadata[i : i + batch_size],
551
+ **options._asdict(),
552
+ )
553
+
554
+ for result in results:
555
+ for segment in result:
556
+ seg_idx += 1
557
+ yield Segment(
558
+ seek=int(result[-1]["end"] * self.model.frames_per_second),
559
+ id=seg_idx,
560
+ text=segment["text"],
561
+ start=round(segment["start"], 3),
562
+ end=round(segment["end"], 3),
563
+ words=(
564
+ None
565
+ if not options.word_timestamps
566
+ else [Word(**word) for word in segment["words"]]
567
+ ),
568
+ tokens=segment["tokens"],
569
+ avg_logprob=segment["avg_logprob"],
570
+ no_speech_prob=segment["no_speech_prob"],
571
+ compression_ratio=segment["compression_ratio"],
572
+ )
573
+
574
+ pbar.update(1)
575
+
576
+ pbar.close()
577
+ # reset the tokenizer if no language was preset, so it is re-detected on the next call
578
+ if self.preset_language is None:
579
+ self.tokenizer = None
580
+ self.last_speech_timestamp = 0.0
581
+
582
+
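A hedged end-to-end sketch of how the batched pipeline above might be driven. The audio path and model size are placeholders; `WhisperModel` is the class defined immediately below.

```python
from faster_whisper.transcribe import BatchedInferencePipeline, WhisperModel

# Placeholder model size and audio path; any CTranslate2-converted Whisper model works.
model = WhisperModel("large-v3", device="auto", compute_type="default")
pipeline = BatchedInferencePipeline(model, use_vad_model=True, chunk_length=30)

# transcribe() returns a lazy generator of Segment tuples plus a TranscriptionInfo.
segments, info = pipeline.transcribe("audio.wav", batch_size=16, language="vi")
print(info.language, info.duration)
for segment in segments:
    print(f"[{segment.start:.2f} -> {segment.end:.2f}] {segment.text}")
```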
583
+ class WhisperModel:
584
+ def __init__(
585
+ self,
586
+ model_size_or_path: str,
587
+ device: str = "auto",
588
+ device_index: Union[int, List[int]] = 0,
589
+ compute_type: str = "default",
590
+ cpu_threads: int = 16,
591
+ num_workers: int = 1,
592
+ download_root: Optional[str] = None,
593
+ local_files_only: bool = False,
594
+ files: dict = None,
595
+ **model_kwargs,
596
+ ):
597
+ """Initializes the Whisper model.
598
+
599
+ Args:
600
+ model_size_or_path: Size of the model to use (tiny, tiny.en, base, base.en,
601
+ small, small.en, distil-small.en, medium, medium.en, distil-medium.en, large-v1,
602
+ large-v2, large-v3, large, distil-large-v2 or distil-large-v3), a path to a
603
+ converted model directory, or a CTranslate2-converted Whisper model ID from the HF Hub.
604
+ When a size or a model ID is configured, the converted model is downloaded
605
+ from the Hugging Face Hub.
606
+ device: Device to use for computation ("cpu", "cuda", "auto").
607
+ device_index: Device ID to use.
608
+ The model can also be loaded on multiple GPUs by passing a list of IDs
609
+ (e.g. [0, 1, 2, 3]). In that case, multiple transcriptions can run in parallel
610
+ when transcribe() is called from multiple Python threads (see also num_workers).
611
+ compute_type: Type to use for computation.
612
+ See https://opennmt.net/CTranslate2/quantization.html.
613
+ cpu_threads: Number of threads to use when running on CPU (16 by default).
614
+ A non-zero value overrides the OMP_NUM_THREADS environment variable.
615
+ num_workers: When transcribe() is called from multiple Python threads,
616
+ having multiple workers enables true parallelism when running the model
617
+ (concurrent calls to self.model.generate() will run in parallel).
618
+ This can improve the global throughput at the cost of increased memory usage.
619
+ download_root: Directory where the models should be saved. If not set, the models
620
+ are saved in the standard Hugging Face cache directory.
621
+ local_files_only: If True, avoid downloading the file and return the path to the
622
+ local cached file if it exists.
623
+ files: Load model files from memory. This argument is a dictionary mapping file names
624
+ to file contents as file-like or bytes objects. If this is set, model_path acts as an
625
+ identifier for this model.
626
+ """
627
+ self.logger = get_logger()
628
+
629
+ tokenizer_bytes, preprocessor_bytes = None, None
630
+ if files:
631
+ model_path = model_size_or_path
632
+ tokenizer_bytes = files.pop("tokenizer.json", None)
633
+ preprocessor_bytes = files.pop("preprocessor_config.json", None)
634
+ elif os.path.isdir(model_size_or_path):
635
+ model_path = model_size_or_path
636
+ else:
637
+ model_path = download_model(
638
+ model_size_or_path,
639
+ local_files_only=local_files_only,
640
+ cache_dir=download_root,
641
+ )
642
+ self.device = device
643
+ # set the random seed to ensure consistency across runs
644
+ ctranslate2.set_random_seed(42)
645
+ self.model = ctranslate2.models.Whisper(
646
+ model_path,
647
+ device=self.device,
648
+ device_index=device_index,
649
+ compute_type=compute_type,
650
+ intra_threads=cpu_threads,
651
+ inter_threads=num_workers,
652
+ files=files,
653
+ **model_kwargs,
654
+ )
655
+
656
+ tokenizer_file = os.path.join(model_path, "tokenizer.json")
657
+ if tokenizer_bytes:
658
+ self.hf_tokenizer = tokenizers.Tokenizer.from_buffer(tokenizer_bytes)
659
+ elif os.path.isfile(tokenizer_file):
660
+ self.hf_tokenizer = tokenizers.Tokenizer.from_file(tokenizer_file)
661
+ else:
662
+ self.hf_tokenizer = tokenizers.Tokenizer.from_pretrained(
663
+ "openai/whisper-tiny" + ("" if self.model.is_multilingual else ".en")
664
+ )
665
+ self.feat_kwargs = self._get_feature_kwargs(model_path, preprocessor_bytes)
666
+ self.feature_extractor = FeatureExtractor(
667
+ **self.feat_kwargs, device=self.device
668
+ )
669
+ self.input_stride = 2
670
+ self.num_samples_per_token = (
671
+ self.feature_extractor.hop_length * self.input_stride
672
+ )
673
+ self.frames_per_second = (
674
+ self.feature_extractor.sampling_rate // self.feature_extractor.hop_length
675
+ )
676
+ self.tokens_per_second = (
677
+ self.feature_extractor.sampling_rate // self.num_samples_per_token
678
+ )
679
+ self.time_precision = 0.02
680
+ self.max_length = 448
681
+
682
+ @property
683
+ def supported_languages(self) -> List[str]:
684
+ """The languages supported by the model."""
685
+ return list(_LANGUAGE_CODES) if self.model.is_multilingual else ["en"]
686
+
687
+ def _get_feature_kwargs(self, model_path, preprocessor_bytes=None) -> dict:
688
+ config = {}
689
+ try:
690
+ config_path = os.path.join(model_path, "preprocessor_config.json")
691
+ if preprocessor_bytes:
692
+ config = json.loads(preprocessor_bytes)
693
+ elif os.path.isfile(config_path):
694
+ with open(config_path, "r", encoding="utf-8") as file:
695
+ config = json.load(file)
696
+ else:
697
+ return config
698
+ valid_keys = signature(FeatureExtractor.__init__).parameters.keys()
699
+ return {k: v for k, v in config.items() if k in valid_keys}
700
+ except json.JSONDecodeError as e:
701
+ self.logger.warning("Could not load preprocessor config: %s", e)
702
+
703
+ return config
704
+
705
+ def transcribe(
706
+ self,
707
+ audio: Union[str, BinaryIO, torch.Tensor, np.ndarray],
708
+ language: Optional[str] = None,
709
+ task: str = "transcribe",
710
+ beam_size: int = 5,
711
+ best_of: int = 5,
712
+ patience: float = 1,
713
+ length_penalty: float = 1,
714
+ repetition_penalty: float = 1,
715
+ no_repeat_ngram_size: int = 0,
716
+ temperature: Union[float, List[float], Tuple[float, ...]] = [
717
+ 0.0,
718
+ 0.2,
719
+ 0.4,
720
+ 0.6,
721
+ 0.8,
722
+ 1.0,
723
+ ],
724
+ compression_ratio_threshold: Optional[float] = 2.4,
725
+ log_prob_threshold: Optional[float] = -1.0,
726
+ log_prob_low_threshold: Optional[float] = None,
727
+ no_speech_threshold: Optional[float] = 0.6,
728
+ condition_on_previous_text: bool = True,
729
+ prompt_reset_on_temperature: float = 0.5,
730
+ initial_prompt: Optional[Union[str, Iterable[int]]] = None,
731
+ prefix: Optional[str] = None,
732
+ suppress_blank: bool = True,
733
+ suppress_tokens: Optional[List[int]] = [-1],
734
+ without_timestamps: bool = False,
735
+ max_initial_timestamp: float = 1.0,
736
+ word_timestamps: bool = False,
737
+ prepend_punctuations: str = "\"'“¿([{-",
738
+ append_punctuations: str = "\"'.。,,!!??::”)]}、",
739
+ multilingual: bool = False,
740
+ output_language: Optional[str] = None,
741
+ vad_filter: bool = False,
742
+ vad_parameters: Optional[Union[dict, VadOptions]] = None,
743
+ max_new_tokens: Optional[int] = None,
744
+ chunk_length: Optional[int] = None,
745
+ clip_timestamps: Union[str, List[float]] = "0",
746
+ hallucination_silence_threshold: Optional[float] = None,
747
+ hotwords: Optional[str] = None,
748
+ language_detection_threshold: Optional[float] = None,
749
+ language_detection_segments: int = 1,
750
+ ) -> Tuple[Iterable[Segment], TranscriptionInfo]:
751
+ """Transcribes an input file.
752
+
753
+ Arguments:
754
+ audio: Path to the input file (or a file-like object), or the audio waveform.
755
+ language: The language spoken in the audio. It should be a language code such
756
+ as "en" or "fr". If not set, the language will be detected in the first 30 seconds
757
+ of audio.
758
+ task: Task to execute (transcribe or translate).
759
+ beam_size: Beam size to use for decoding.
760
+ best_of: Number of candidates when sampling with non-zero temperature.
761
+ patience: Beam search patience factor.
762
+ length_penalty: Exponential length penalty constant.
763
+ repetition_penalty: Penalty applied to the score of previously generated tokens
764
+ (set > 1 to penalize).
765
+ no_repeat_ngram_size: Prevent repetitions of ngrams with this size (set 0 to disable).
766
+ temperature: Temperature for sampling. It can be a tuple of temperatures,
767
+ which will be successively used upon failures according to either
768
+ `compression_ratio_threshold` or `log_prob_threshold`.
769
+ compression_ratio_threshold: If the gzip compression ratio is above this value,
770
+ treat as failed.
771
+ log_prob_threshold: If the average log probability over sampled tokens is
772
+ below this value, treat as failed.
773
+ log_prob_low_threshold: This parameter alone is sufficient to skip an output text,
774
+ whereas log_prob_threshold also looks for an appropriate no_speech_threshold value.
775
+ This value should be less than log_prob_threshold.
776
+ no_speech_threshold: If the no_speech probability is higher than this value AND
777
+ the average log probability over sampled tokens is below `log_prob_threshold`,
778
+ consider the segment as silent.
779
+ condition_on_previous_text: If True, the previous output of the model is provided
780
+ as a prompt for the next window; disabling may make the text inconsistent across
781
+ windows, but the model becomes less prone to getting stuck in a failure loop,
782
+ such as repetition looping or timestamps going out of sync.
783
+ prompt_reset_on_temperature: Resets prompt if temperature is above this value.
784
+ Arg has effect only if condition_on_previous_text is True.
785
+ initial_prompt: Optional text string or iterable of token ids to provide as a
786
+ prompt for the first window.
787
+ prefix: Optional text to provide as a prefix for the first window.
788
+ suppress_blank: Suppress blank outputs at the beginning of the sampling.
789
+ suppress_tokens: List of token IDs to suppress. -1 will suppress a default set
790
+ of symbols as defined in `tokenizer.non_speech_tokens()`.
791
+ without_timestamps: Only sample text tokens.
792
+ max_initial_timestamp: The initial timestamp cannot be later than this.
793
+ word_timestamps: Extract word-level timestamps using the cross-attention pattern
794
+ and dynamic time warping, and include the timestamps for each word in each segment.
795
+ prepend_punctuations: If word_timestamps is True, merge these punctuation symbols
796
+ with the next word
797
+ append_punctuations: If word_timestamps is True, merge these punctuation symbols
798
+ with the previous word
799
+ multilingual: If True, perform transcription on multilingual videos
800
+ and return the transcript based
801
+ on the 'output_language' flag.
802
+ output_language: Valid only if multilingual is set to True.
803
+ Specifies the string representing the output language. One of
804
+ 'en' (English) or 'hybrid' (code-switched transcription).
805
+ vad_filter: Enable the voice activity detection (VAD) to filter out parts of the audio
806
+ without speech. This step is using the Silero VAD model
807
+ https://github.com/snakers4/silero-vad.
808
+ vad_parameters: Dictionary of Silero VAD parameters or VadOptions class (see available
809
+ parameters and default values in the class `VadOptions`).
810
+ max_new_tokens: Maximum number of new tokens to generate per-chunk. If not set,
811
+ the maximum will be set by the default max_length.
812
+ chunk_length: The length of audio segments. If it is not None, it will overwrite the
813
+ default chunk_length of the FeatureExtractor.
814
+ clip_timestamps:
815
+ Comma-separated list start,end,start,end,... timestamps (in seconds) of clips to
816
+ process. The last end timestamp defaults to the end of the file.
817
+ vad_filter will be ignored if clip_timestamps is used.
818
+ hallucination_silence_threshold:
819
+ When word_timestamps is True, skip silent periods longer than this threshold
820
+ (in seconds) when a possible hallucination is detected
821
+ hotwords:
822
+ Hotwords/hint phrases to provide the model with. Has no effect if prefix is not None.
823
+ language_detection_threshold: If the maximum probability of the language tokens is higher
824
+ than this value, the language is detected.
825
+ language_detection_segments: Number of segments to consider for the language detection.
826
+ Returns:
827
+ A tuple with:
828
+
829
+ - a generator over transcribed segments
830
+ - an instance of TranscriptionInfo
831
+ """
832
+
833
+ sampling_rate = self.feature_extractor.sampling_rate
834
+
835
+ if isinstance(audio, np.ndarray):
836
+ audio = torch.from_numpy(audio)
837
+ elif not isinstance(audio, torch.Tensor):
838
+ audio = decode_audio(audio, sampling_rate=sampling_rate)
839
+
840
+ duration = audio.shape[0] / sampling_rate
841
+ duration_after_vad = duration
842
+
843
+ self.logger.info(
844
+ "Processing audio with duration %s", format_timestamp(duration)
845
+ )
846
+
847
+ if vad_filter and clip_timestamps == "0":
848
+ if vad_parameters is None:
849
+ vad_parameters = VadOptions()
850
+ elif isinstance(vad_parameters, dict):
851
+ vad_parameters = VadOptions(**vad_parameters)
852
+ speech_chunks = get_speech_timestamps(audio, vad_parameters)
853
+ audio = collect_chunks(audio, speech_chunks)
854
+ duration_after_vad = audio.shape[0] / sampling_rate
855
+
856
+ self.logger.info(
857
+ "VAD filter removed %s of audio",
858
+ format_timestamp(duration - duration_after_vad),
859
+ )
860
+
861
+ if self.logger.isEnabledFor(logging.DEBUG):
862
+ self.logger.debug(
863
+ "VAD filter kept the following audio segments: %s",
864
+ ", ".join(
865
+ "[%s -> %s]"
866
+ % (
867
+ format_timestamp(chunk["start"] / sampling_rate),
868
+ format_timestamp(chunk["end"] / sampling_rate),
869
+ )
870
+ for chunk in speech_chunks
871
+ ),
872
+ )
873
+
874
+ else:
875
+ speech_chunks = None
876
+
877
+ to_cpu = self.model.device == "cuda" and len(self.model.device_index) > 1
878
+ features = self.feature_extractor(
879
+ audio, chunk_length=chunk_length, to_cpu=to_cpu
880
+ )
881
+
882
+ encoder_output = None
883
+ all_language_probs = None
884
+
885
+ # setting output_language for multilingual videos
886
+ if multilingual:
887
+ if output_language is None:
888
+ output_language = "en"
889
+ elif output_language not in ["en", "hybrid"]:
890
+ raise ValueError("Output language needs to be one of 'en'/'hybrid'.")
891
+
892
+ # detecting the language if not provided
893
+ if language is None:
894
+ if not self.model.is_multilingual:
895
+ language = "en"
896
+ language_probability = 1
897
+ else:
898
+ if (
899
+ language_detection_segments is None
900
+ or language_detection_segments < 1
901
+ ):
902
+ language_detection_segments = 1
903
+ start_timestamp = (
904
+ float(clip_timestamps.split(",")[0])
905
+ if isinstance(clip_timestamps, str)
906
+ else clip_timestamps[0]
907
+ )
908
+ content_frames = (
909
+ features.shape[-1] - self.feature_extractor.nb_max_frames
910
+ )
911
+ seek = (
912
+ int(start_timestamp * self.frames_per_second)
913
+ if start_timestamp * self.frames_per_second < content_frames
914
+ else 0
915
+ )
916
+ end_frames = min(
917
+ seek
918
+ + self.feature_extractor.nb_max_frames
919
+ * language_detection_segments,
920
+ content_frames,
921
+ )
922
+ detected_language_info = {}
923
+ while seek <= end_frames:
924
+ segment = features[
925
+ :, seek : seek + self.feature_extractor.nb_max_frames
926
+ ]
927
+ encoder_output = self.encode(segment)
928
+ # results is a list of tuple[str, float] with language names and
929
+ # probabilities.
930
+ results = self.model.detect_language(encoder_output)[0]
931
+ # Parse language names to strip out markers
932
+ all_language_probs = [
933
+ (token[2:-2], prob) for (token, prob) in results
934
+ ]
935
+ # Get top language token and probability
936
+ language, language_probability = all_language_probs[0]
937
+ if (
938
+ language_detection_threshold is None
939
+ or language_probability > language_detection_threshold
940
+ ):
941
+ break
942
+ detected_language_info.setdefault(language, []).append(
943
+ language_probability
944
+ )
945
+ seek += segment.shape[-1]
946
+ else:
947
+ # If no language was detected above the threshold for any segment, the majority
948
+ # vote of the top predicted language across all segments determines the language.
949
+ language = max(
950
+ detected_language_info,
951
+ key=lambda lang: len(detected_language_info[lang]),
952
+ )
953
+ language_probability = max(detected_language_info[language])
954
+
955
+ self.logger.info(
956
+ "Detected language '%s' with probability %.2f",
957
+ language,
958
+ language_probability,
959
+ )
960
+ else:
961
+ if not self.model.is_multilingual and language != "en":
962
+ self.logger.warning(
963
+ "The current model is English-only but the language parameter is set to '%s'; "
964
+ "using 'en' instead." % language
965
+ )
966
+ language = "en"
967
+
968
+ language_probability = 1
969
+
970
+ tokenizer = Tokenizer(
971
+ self.hf_tokenizer,
972
+ self.model.is_multilingual,
973
+ task=task,
974
+ language=language,
975
+ )
976
+
977
+ options = TranscriptionOptions(
978
+ beam_size=beam_size,
979
+ best_of=best_of,
980
+ patience=patience,
981
+ length_penalty=length_penalty,
982
+ repetition_penalty=repetition_penalty,
983
+ no_repeat_ngram_size=no_repeat_ngram_size,
984
+ log_prob_threshold=log_prob_threshold,
985
+ log_prob_low_threshold=log_prob_low_threshold,
986
+ no_speech_threshold=no_speech_threshold,
987
+ compression_ratio_threshold=compression_ratio_threshold,
988
+ condition_on_previous_text=condition_on_previous_text,
989
+ prompt_reset_on_temperature=prompt_reset_on_temperature,
990
+ temperatures=(
991
+ temperature if isinstance(temperature, (list, tuple)) else [temperature]
992
+ ),
993
+ initial_prompt=initial_prompt,
994
+ prefix=prefix,
995
+ suppress_blank=suppress_blank,
996
+ suppress_tokens=(
997
+ get_suppressed_tokens(tokenizer, suppress_tokens)
998
+ if suppress_tokens
999
+ else suppress_tokens
1000
+ ),
1001
+ without_timestamps=without_timestamps,
1002
+ max_initial_timestamp=max_initial_timestamp,
1003
+ word_timestamps=word_timestamps,
1004
+ prepend_punctuations=prepend_punctuations,
1005
+ append_punctuations=append_punctuations,
1006
+ multilingual=multilingual,
1007
+ output_language=output_language,
1008
+ max_new_tokens=max_new_tokens,
1009
+ clip_timestamps=clip_timestamps,
1010
+ hallucination_silence_threshold=hallucination_silence_threshold,
1011
+ hotwords=hotwords,
1012
+ )
1013
+
1014
+ segments = self.generate_segments(features, tokenizer, options, encoder_output)
1015
+
1016
+ if speech_chunks:
1017
+ segments = restore_speech_timestamps(segments, speech_chunks, sampling_rate)
1018
+
1019
+ info = TranscriptionInfo(
1020
+ language=language,
1021
+ language_probability=language_probability,
1022
+ duration=duration,
1023
+ duration_after_vad=duration_after_vad,
1024
+ transcription_options=options,
1025
+ vad_options=vad_parameters,
1026
+ all_language_probs=all_language_probs,
1027
+ )
1028
+ return segments, info
1029
+
1030
+ def _split_segments_by_timestamps(
1031
+ self,
1032
+ tokenizer: Tokenizer,
1033
+ tokens: List[int],
1034
+ time_offset: float,
1035
+ segment_size: int,
1036
+ segment_duration: float,
1037
+ seek: int,
1038
+ ) -> Tuple[List[dict], int, bool]:
1039
+ current_segments = []
1040
+ single_timestamp_ending = (
1041
+ len(tokens) >= 2 and tokens[-2] < tokenizer.timestamp_begin <= tokens[-1]
1042
+ )
1043
+
1044
+ consecutive_timestamps = [
1045
+ i
1046
+ for i in range(len(tokens))
1047
+ if i > 0
1048
+ and tokens[i] >= tokenizer.timestamp_begin
1049
+ and tokens[i - 1] >= tokenizer.timestamp_begin
1050
+ ]
1051
+
1052
+ if len(consecutive_timestamps) > 0:
1053
+ slices = list(consecutive_timestamps)
1054
+ if single_timestamp_ending:
1055
+ slices.append(len(tokens))
1056
+
1057
+ last_slice = 0
1058
+ for current_slice in slices:
1059
+ sliced_tokens = tokens[last_slice:current_slice]
1060
+ start_timestamp_position = sliced_tokens[0] - tokenizer.timestamp_begin
1061
+ end_timestamp_position = sliced_tokens[-1] - tokenizer.timestamp_begin
1062
+ start_time = (
1063
+ time_offset + start_timestamp_position * self.time_precision
1064
+ )
1065
+ end_time = time_offset + end_timestamp_position * self.time_precision
1066
+
1067
+ current_segments.append(
1068
+ dict(
1069
+ seek=seek,
1070
+ start=start_time,
1071
+ end=end_time,
1072
+ tokens=sliced_tokens,
1073
+ )
1074
+ )
1075
+ last_slice = current_slice
1076
+
1077
+ if single_timestamp_ending:
1078
+ # single timestamp at the end means no speech after the last timestamp.
1079
+ seek += segment_size
1080
+ else:
1081
+ # otherwise, ignore the unfinished segment and seek to the last timestamp
1082
+ last_timestamp_position = (
1083
+ tokens[last_slice - 1] - tokenizer.timestamp_begin
1084
+ )
1085
+ seek += last_timestamp_position * self.input_stride
1086
+
1087
+ else:
1088
+ duration = segment_duration
1089
+ timestamps = [
1090
+ token for token in tokens if token >= tokenizer.timestamp_begin
1091
+ ]
1092
+ if len(timestamps) > 0 and timestamps[-1] != tokenizer.timestamp_begin:
1093
+ last_timestamp_position = timestamps[-1] - tokenizer.timestamp_begin
1094
+ duration = last_timestamp_position * self.time_precision
1095
+
1096
+ current_segments.append(
1097
+ dict(
1098
+ seek=seek,
1099
+ start=time_offset,
1100
+ end=time_offset + duration,
1101
+ tokens=tokens,
1102
+ )
1103
+ )
1104
+
1105
+ seek += segment_size
1106
+
1107
+ return current_segments, seek, single_timestamp_ending
1108
+
1109
+ def generate_segments(
1110
+ self,
1111
+ features: torch.Tensor,
1112
+ tokenizer: Tokenizer,
1113
+ options: TranscriptionOptions,
1114
+ encoder_output: Optional[ctranslate2.StorageView] = None,
1115
+ ) -> Iterable[Segment]:
1116
+ content_frames = features.shape[-1] - self.feature_extractor.nb_max_frames
1117
+ content_duration = float(content_frames * self.feature_extractor.time_per_frame)
1118
+
1119
+ if isinstance(options.clip_timestamps, str):
1120
+ options = options._replace(
1121
+ clip_timestamps=[
1122
+ float(ts)
1123
+ for ts in (
1124
+ options.clip_timestamps.split(",")
1125
+ if options.clip_timestamps
1126
+ else []
1127
+ )
1128
+ ]
1129
+ )
1130
+ seek_points: List[int] = [
1131
+ round(ts * self.frames_per_second) for ts in options.clip_timestamps
1132
+ ]
1133
+ if len(seek_points) == 0:
1134
+ seek_points.append(0)
1135
+ if len(seek_points) % 2 == 1:
1136
+ seek_points.append(content_frames)
1137
+ seek_clips: List[Tuple[int, int]] = list(
1138
+ zip(seek_points[::2], seek_points[1::2])
1139
+ )
1140
+
1141
+ punctuation = "\"'“¿([{-\"'.。,,!!??::”)]}、"
1142
+
1143
+ idx = 0
1144
+ clip_idx = 0
1145
+ seek = seek_clips[clip_idx][0]
1146
+ all_tokens = []
1147
+ prompt_reset_since = 0
1148
+
1149
+ if options.initial_prompt is not None:
1150
+ if isinstance(options.initial_prompt, str):
1151
+ initial_prompt = " " + options.initial_prompt.strip()
1152
+ initial_prompt_tokens = tokenizer.encode(initial_prompt)
1153
+ all_tokens.extend(initial_prompt_tokens)
1154
+ else:
1155
+ all_tokens.extend(options.initial_prompt)
1156
+
1157
+ last_speech_timestamp = 0.0
1158
+ # NOTE: This loop is obscurely flattened to make the diff readable.
1159
+ # A later commit should turn this into a simpler nested loop.
1160
+ # for seek_clip_start, seek_clip_end in seek_clips:
1161
+ # while seek < seek_clip_end
1162
+ while clip_idx < len(seek_clips):
1163
+ seek_clip_start, seek_clip_end = seek_clips[clip_idx]
1164
+ if seek_clip_end > content_frames:
1165
+ seek_clip_end = content_frames
1166
+ if seek < seek_clip_start:
1167
+ seek = seek_clip_start
1168
+ if seek >= seek_clip_end:
1169
+ clip_idx += 1
1170
+ if clip_idx < len(seek_clips):
1171
+ seek = seek_clips[clip_idx][0]
1172
+ continue
1173
+ time_offset = seek * self.feature_extractor.time_per_frame
1174
+ window_end_time = float(
1175
+ (seek + self.feature_extractor.nb_max_frames)
1176
+ * self.feature_extractor.time_per_frame
1177
+ )
1178
+ segment_size = min(
1179
+ self.feature_extractor.nb_max_frames,
1180
+ content_frames - seek,
1181
+ seek_clip_end - seek,
1182
+ )
1183
+ segment = features[:, seek : seek + segment_size]
1184
+ segment_duration = segment_size * self.feature_extractor.time_per_frame
1185
+ segment = pad_or_trim(segment, self.feature_extractor.nb_max_frames)
1186
+
1187
+ if self.logger.isEnabledFor(logging.DEBUG):
1188
+ self.logger.debug(
1189
+ "Processing segment at %s", format_timestamp(time_offset)
1190
+ )
1191
+
1192
+ previous_tokens = all_tokens[prompt_reset_since:]
1193
+
1194
+ if encoder_output is None:
1195
+ encoder_output = self.encode(segment)
1196
+
1197
+ # Perform language detection on every segment to update the task based on the output language:
1198
+ # if the detected language is English, the task is transcribe;
1199
+ # otherwise the task is translate to English (default),
1200
+ # or transcribe if 'output_language' is 'hybrid'.
1201
+ if options.multilingual:
1202
+ results = self.model.detect_language(encoder_output)
1203
+ language_token, language_probability = results[0][0]
1204
+ language = language_token[2:-2]
1205
+ if options.output_language == "en" and language != "en":
1206
+ task = "translate"
1207
+ else:
1208
+ task = "transcribe"
1209
+
1210
+ # Update tokenizer based on task and language
1211
+ tokenizer.task = tokenizer.tokenizer.token_to_id(f"<|{task}|>")
1212
+ tokenizer.language = tokenizer.tokenizer.token_to_id(language_token)
1213
+ tokenizer.language_code = language
1214
+ # Update prompt based on task and language
1215
+ prompt = self.get_prompt(
1216
+ tokenizer,
1217
+ previous_tokens,
1218
+ without_timestamps=options.without_timestamps,
1219
+ prefix=options.prefix if seek == 0 else None,
1220
+ hotwords=options.hotwords,
1221
+ )
1222
+
1223
+ if seek > 0 or encoder_output is None:
1224
+ encoder_output = self.encode(segment)
1225
+
1226
+ (
1227
+ result,
1228
+ avg_logprob,
1229
+ temperature,
1230
+ compression_ratio,
1231
+ ) = self.generate_with_fallback(encoder_output, prompt, tokenizer, options)
1232
+
1233
+ if options.no_speech_threshold is not None:
1234
+ # no voice activity check
1235
+ should_skip = result.no_speech_prob > options.no_speech_threshold
1236
+
1237
+ if (
1238
+ options.log_prob_threshold is not None
1239
+ and avg_logprob > options.log_prob_threshold
1240
+ ):
1241
+ # don't skip if the logprob is high enough, despite the no_speech_prob
1242
+ should_skip = False
1243
+
1244
+ if should_skip:
1245
+ self.logger.debug(
1246
+ "No speech threshold is met (%f > %f)",
1247
+ result.no_speech_prob,
1248
+ options.no_speech_threshold,
1249
+ )
1250
+
1251
+ # Skip if the logprob is very low (below the threshold value),
1252
+ # despite no_speech_prob being low (ex: Too ambiguous outputs)
1253
+ if options.log_prob_low_threshold:
1254
+ if avg_logprob < options.log_prob_low_threshold:
1255
+ should_skip = True
1256
+ self.logger.debug(
1257
+ "log prob low threshold is met (%f > %f)",
1258
+ avg_logprob,
1259
+ options.log_prob_low_threshold,
1260
+ )
1261
+
1262
+ if should_skip:
1263
+ # fast-forward to the next segment boundary
1264
+ seek += segment_size
1265
+ continue
1266
+
1267
+ tokens = result.sequences_ids[0]
1268
+
1269
+ previous_seek = seek
1270
+
1271
+ # anomalous words are very long/short/improbable
1272
+ def word_anomaly_score(word: dict) -> float:
1273
+ probability = word.get("probability", 0.0)
1274
+ duration = word["end"] - word["start"]
1275
+ score = 0.0
1276
+ if probability < 0.15:
1277
+ score += 1.0
1278
+ if duration < 0.133:
1279
+ score += (0.133 - duration) * 15
1280
+ if duration > 2.0:
1281
+ score += duration - 2.0
1282
+ return score
1283
+
1284
+ def is_segment_anomaly(segment: Optional[dict]) -> bool:
1285
+ if segment is None or not segment["words"]:
1286
+ return False
1287
+ words = [w for w in segment["words"] if w["word"] not in punctuation]
1288
+ words = words[:8]
1289
+ score = sum(word_anomaly_score(w) for w in words)
1290
+ return score >= 3 or score + 0.01 >= len(words)
1291
+
1292
+ def next_words_segment(segments: List[dict]) -> Optional[dict]:
1293
+ return next((s for s in segments if s["words"]), None)
1294
+
1295
+ (
1296
+ current_segments,
1297
+ seek,
1298
+ single_timestamp_ending,
1299
+ ) = self._split_segments_by_timestamps(
1300
+ tokenizer=tokenizer,
1301
+ tokens=tokens,
1302
+ time_offset=time_offset,
1303
+ segment_size=segment_size,
1304
+ segment_duration=segment_duration,
1305
+ seek=seek,
1306
+ )
1307
+
1308
+ if options.word_timestamps:
1309
+ self.add_word_timestamps(
1310
+ [current_segments],
1311
+ tokenizer,
1312
+ encoder_output,
1313
+ segment_size,
1314
+ options.prepend_punctuations,
1315
+ options.append_punctuations,
1316
+ last_speech_timestamp=last_speech_timestamp,
1317
+ )
1318
+ if not single_timestamp_ending:
1319
+ last_word_end = get_end(current_segments)
1320
+ if last_word_end is not None and last_word_end > time_offset:
1321
+ seek = round(last_word_end * self.frames_per_second)
1322
+
1323
+ # skip silence before possible hallucinations
1324
+ if options.hallucination_silence_threshold is not None:
1325
+ threshold = options.hallucination_silence_threshold
1326
+
1327
+ # if first segment might be a hallucination, skip leading silence
1328
+ first_segment = next_words_segment(current_segments)
1329
+ if first_segment is not None and is_segment_anomaly(first_segment):
1330
+ gap = first_segment["start"] - time_offset
1331
+ if gap > threshold:
1332
+ seek = previous_seek + round(gap * self.frames_per_second)
1333
+ continue
1334
+
1335
+ # skip silence before any possible hallucination that is surrounded
1336
+ # by silence or more hallucinations
1337
+ hal_last_end = last_speech_timestamp
1338
+ for si in range(len(current_segments)):
1339
+ segment = current_segments[si]
1340
+ if not segment["words"]:
1341
+ continue
1342
+ if is_segment_anomaly(segment):
1343
+ next_segment = next_words_segment(
1344
+ current_segments[si + 1 :]
1345
+ )
1346
+ if next_segment is not None:
1347
+ hal_next_start = next_segment["words"][0]["start"]
1348
+ else:
1349
+ hal_next_start = time_offset + segment_duration
1350
+ silence_before = (
1351
+ segment["start"] - hal_last_end > threshold
1352
+ or segment["start"] < threshold
1353
+ or segment["start"] - time_offset < 2.0
1354
+ )
1355
+ silence_after = (
1356
+ hal_next_start - segment["end"] > threshold
1357
+ or is_segment_anomaly(next_segment)
1358
+ or window_end_time - segment["end"] < 2.0
1359
+ )
1360
+ if silence_before and silence_after:
1361
+ seek = round(
1362
+ max(time_offset + 1, segment["start"])
1363
+ * self.frames_per_second
1364
+ )
1365
+ if content_duration - segment["end"] < threshold:
1366
+ seek = content_frames
1367
+ current_segments[si:] = []
1368
+ break
1369
+ hal_last_end = segment["end"]
1370
+
1371
+ last_word_end = get_end(current_segments)
1372
+ if last_word_end is not None:
1373
+ last_speech_timestamp = last_word_end
1374
+ for segment in current_segments:
1375
+ tokens = segment["tokens"]
1376
+ text = tokenizer.decode(tokens)
1377
+
1378
+ if segment["start"] == segment["end"] or not text.strip():
1379
+ continue
1380
+
1381
+ all_tokens.extend(tokens)
1382
+ idx += 1
1383
+
1384
+ yield Segment(
1385
+ id=idx,
1386
+ seek=seek,
1387
+ start=segment["start"],
1388
+ end=segment["end"],
1389
+ text=text,
1390
+ tokens=tokens,
1391
+ temperature=temperature,
1392
+ avg_logprob=avg_logprob,
1393
+ compression_ratio=compression_ratio,
1394
+ no_speech_prob=result.no_speech_prob,
1395
+ words=(
1396
+ [Word(**word) for word in segment["words"]]
1397
+ if options.word_timestamps
1398
+ else None
1399
+ ),
1400
+ )
1401
+
1402
+ if (
1403
+ not options.condition_on_previous_text
1404
+ or temperature > options.prompt_reset_on_temperature
1405
+ ):
1406
+ if options.condition_on_previous_text:
1407
+ self.logger.debug(
1408
+ "Reset prompt. prompt_reset_on_temperature threshold is met %f > %f",
1409
+ temperature,
1410
+ options.prompt_reset_on_temperature,
1411
+ )
1412
+
1413
+ prompt_reset_since = len(all_tokens)
1414
+
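Note: the loop above yields Segment objects lazily, so decoding only happens as the caller iterates over them. A minimal consumption sketch through the public API (the model size, device, and audio path below are placeholders):

from faster_whisper import WhisperModel

model = WhisperModel("large-v3", device="cuda", compute_type="float16")
segments, info = model.transcribe("audio.wav", beam_size=5, word_timestamps=True)

for segment in segments:  # transcription runs here, one window at a time
    print(f"[{segment.start:.2f} -> {segment.end:.2f}] {segment.text}")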
1415
+ def encode(self, features: torch.Tensor) -> ctranslate2.StorageView:
1416
+ # When the model is running on multiple GPUs, the encoder output should be moved
1417
+ # to the CPU since we don't know which GPU will handle the next job.
1418
+ to_cpu = self.model.device == "cuda" and len(self.model.device_index) > 1
1419
+
1420
+ if features.ndim == 2:
1421
+ features = features.unsqueeze(0)
1422
+ features = get_ctranslate2_storage(features)
1423
+
1424
+ return self.model.encode(features, to_cpu=to_cpu)
1425
+
1426
+ def generate_with_fallback(
1427
+ self,
1428
+ encoder_output: ctranslate2.StorageView,
1429
+ prompt: List[int],
1430
+ tokenizer: Tokenizer,
1431
+ options: TranscriptionOptions,
1432
+ ) -> Tuple[ctranslate2.models.WhisperGenerationResult, float, float, float]:
1433
+ decode_result = None
1434
+ all_results = []
1435
+ below_cr_threshold_results = []
1436
+
1437
+ max_initial_timestamp_index = int(
1438
+ round(options.max_initial_timestamp / self.time_precision)
1439
+ )
1440
+ if options.max_new_tokens is not None:
1441
+ max_length = len(prompt) + options.max_new_tokens
1442
+ else:
1443
+ max_length = self.max_length
1444
+
1445
+ if max_length > self.max_length:
1446
+ raise ValueError(
1447
+ f"The length of the prompt is {len(prompt)}, and the `max_new_tokens` "
1448
+ f"{max_length - len(prompt)}. Thus, the combined length of the prompt "
1449
+ f"and `max_new_tokens` is: {max_length}. This exceeds the "
1450
+ f"`max_length` of the Whisper model: {self.max_length}. "
1451
+ "You should either reduce the length of your prompt, or "
1452
+ "reduce the value of `max_new_tokens`, "
1453
+ f"so that their combined length is less that {self.max_length}."
1454
+ )
1455
+
1456
+ for temperature in options.temperatures:
1457
+ if temperature > 0:
1458
+ kwargs = {
1459
+ "beam_size": 1,
1460
+ "num_hypotheses": options.best_of,
1461
+ "sampling_topk": 0,
1462
+ "sampling_temperature": temperature,
1463
+ }
1464
+ else:
1465
+ kwargs = {
1466
+ "beam_size": options.beam_size,
1467
+ "patience": options.patience,
1468
+ }
1469
+
1470
+ result = self.model.generate(
1471
+ encoder_output,
1472
+ [prompt],
1473
+ length_penalty=options.length_penalty,
1474
+ repetition_penalty=options.repetition_penalty,
1475
+ no_repeat_ngram_size=options.no_repeat_ngram_size,
1476
+ max_length=max_length,
1477
+ return_scores=True,
1478
+ return_no_speech_prob=True,
1479
+ suppress_blank=options.suppress_blank,
1480
+ suppress_tokens=options.suppress_tokens,
1481
+ max_initial_timestamp_index=max_initial_timestamp_index,
1482
+ **kwargs,
1483
+ )[0]
1484
+
1485
+ tokens = result.sequences_ids[0]
1486
+
1487
+ # Recover the average log prob from the returned score.
1488
+ seq_len = len(tokens)
1489
+ cum_logprob = result.scores[0] * (seq_len**options.length_penalty)
1490
+ avg_logprob = cum_logprob / (seq_len + 1)
1491
+
1492
+ text = tokenizer.decode(tokens).strip()
1493
+ compression_ratio = get_compression_ratio(text)
1494
+
1495
+ decode_result = (
1496
+ result,
1497
+ avg_logprob,
1498
+ temperature,
1499
+ compression_ratio,
1500
+ )
1501
+ all_results.append(decode_result)
1502
+
1503
+ needs_fallback = False
1504
+
1505
+ if options.compression_ratio_threshold is not None:
1506
+ if compression_ratio > options.compression_ratio_threshold:
1507
+ needs_fallback = True # too repetitive
1508
+
1509
+ self.logger.debug(
1510
+ "Compression ratio threshold is not met with temperature %.1f (%f > %f)",
1511
+ temperature,
1512
+ compression_ratio,
1513
+ options.compression_ratio_threshold,
1514
+ )
1515
+ else:
1516
+ below_cr_threshold_results.append(decode_result)
1517
+
1518
+ if (
1519
+ options.log_prob_threshold is not None
1520
+ and avg_logprob < options.log_prob_threshold
1521
+ ):
1522
+ needs_fallback = True # average log probability is too low
1523
+
1524
+ self.logger.debug(
1525
+ "Log probability threshold is not met with temperature %.1f (%f < %f)",
1526
+ temperature,
1527
+ avg_logprob,
1528
+ options.log_prob_threshold,
1529
+ )
1530
+
1531
+ if (
1532
+ options.no_speech_threshold is not None
1533
+ and result.no_speech_prob > options.no_speech_threshold
1534
+ and options.log_prob_threshold is not None
1535
+ and avg_logprob < options.log_prob_threshold
1536
+ ):
1537
+ needs_fallback = False # silence
1538
+
1539
+ if not needs_fallback:
1540
+ break
1541
+ else:
1542
+ # all failed, select the result with the highest average log probability
1543
+ decode_result = max(
1544
+ below_cr_threshold_results or all_results, key=lambda x: x[1]
1545
+ )
1546
+ # to pass final temperature for prompt_reset_on_temperature
1547
+ decode_result = (
1548
+ decode_result[0],
1549
+ decode_result[1],
1550
+ temperature,
1551
+ decode_result[3],
1552
+ )
1553
+
1554
+ return decode_result
1555
+
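To make the score handling in generate_with_fallback concrete, here is a small sketch of how the average log probability is recovered from the length-normalized score returned by CTranslate2. The numbers are made up, and the +1 in the denominator presumably accounts for the end-of-text token:

def recover_avg_logprob(score: float, seq_len: int, length_penalty: float) -> float:
    # CTranslate2 returns a length-normalized score: undo the normalization to get
    # the cumulative log probability, then average it over seq_len + 1 tokens.
    cum_logprob = score * (seq_len ** length_penalty)
    return cum_logprob / (seq_len + 1)

print(recover_avg_logprob(-0.25, 19, 1.0))  # -0.2375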
1556
+ def get_prompt(
1557
+ self,
1558
+ tokenizer: Tokenizer,
1559
+ previous_tokens: List[int],
1560
+ without_timestamps: bool = False,
1561
+ prefix: Optional[str] = None,
1562
+ hotwords: Optional[str] = None,
1563
+ ) -> List[int]:
1564
+ prompt = []
1565
+
1566
+ if previous_tokens or (hotwords and not prefix):
1567
+ prompt.append(tokenizer.sot_prev)
1568
+ if hotwords and not prefix:
1569
+ hotwords_tokens = tokenizer.encode(" " + hotwords.strip())
1570
+ if len(hotwords_tokens) >= self.max_length // 2:
1571
+ hotwords_tokens = hotwords_tokens[: self.max_length // 2 - 1]
1572
+ prompt.extend(hotwords_tokens)
1573
+ if previous_tokens:
1574
+ prompt.extend(previous_tokens[-(self.max_length // 2 - 1) :])
1575
+
1576
+ prompt.extend(tokenizer.sot_sequence)
1577
+
1578
+ if without_timestamps:
1579
+ prompt.append(tokenizer.no_timestamps)
1580
+
1581
+ if prefix:
1582
+ prefix_tokens = tokenizer.encode(" " + prefix.strip())
1583
+ if len(prefix_tokens) >= self.max_length // 2:
1584
+ prefix_tokens = prefix_tokens[: self.max_length // 2 - 1]
1585
+ if not without_timestamps:
1586
+ prompt.append(tokenizer.timestamp_begin)
1587
+ prompt.extend(prefix_tokens)
1588
+
1589
+ return prompt
1590
+
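For reference, the token order built by get_prompt looks roughly like the illustrative list below (special tokens are written as their string forms rather than integer IDs; which pieces appear depends on the options passed):

prompt_layout = [
    "<|startofprev|>",                        # only with previous tokens or hotwords
    "... hotword tokens ...",                 # only when hotwords are set and no prefix is given
    "... last (max_length // 2 - 1) previous tokens ...",
    "<|startoftranscript|>", "<|en|>", "<|transcribe|>",  # tokenizer.sot_sequence (language/task vary)
    "<|notimestamps|>",                       # only when without_timestamps=True
    "<|0.00|>",                               # timestamp_begin, only for a prefix with timestamps
    "... prefix tokens ...",
]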
1591
+ def add_word_timestamps(
1592
+ self,
1593
+ segments: List[dict],
1594
+ tokenizer: Tokenizer,
1595
+ encoder_output: ctranslate2.StorageView,
1596
+ num_frames: int,
1597
+ prepend_punctuations: str,
1598
+ append_punctuations: str,
1599
+ last_speech_timestamp: float,
1600
+ ) -> float:
1601
+ if len(segments) == 0:
1602
+ return
1603
+
1604
+ text_tokens = []
1605
+ text_tokens_per_segment = []
1606
+ for segment in segments:
1607
+ segment_tokens = [
1608
+ [token for token in subsegment["tokens"] if token < tokenizer.eot]
1609
+ for subsegment in segment
1610
+ ]
1611
+ text_tokens.append(list(itertools.chain.from_iterable(segment_tokens)))
1612
+ text_tokens_per_segment.append(segment_tokens)
1613
+
1614
+ alignments = self.find_alignment(
1615
+ tokenizer, text_tokens, encoder_output, num_frames
1616
+ )
1617
+ median_max_durations = []
1618
+ for alignment in alignments:
1619
+ word_durations = np.array(
1620
+ [word["end"] - word["start"] for word in alignment]
1621
+ )
1622
+ word_durations = word_durations[word_durations.nonzero()]
1623
+ median_duration = (
1624
+ np.median(word_durations) if len(word_durations) > 0 else 0.0
1625
+ )
1626
+ median_duration = min(0.7, float(median_duration))
1627
+ max_duration = median_duration * 2
1628
+
1629
+ # hack: truncate long words at sentence boundaries.
1630
+ # a better segmentation algorithm based on VAD should be able to replace this.
1631
+ if len(word_durations) > 0:
1632
+ sentence_end_marks = ".。!!??"
1633
+ # ensure words at sentence boundaries
1634
+ # are not longer than twice the median word duration.
1635
+ for i in range(1, len(alignment)):
1636
+ if alignment[i]["end"] - alignment[i]["start"] > max_duration:
1637
+ if alignment[i]["word"] in sentence_end_marks:
1638
+ alignment[i]["end"] = alignment[i]["start"] + max_duration
1639
+ elif alignment[i - 1]["word"] in sentence_end_marks:
1640
+ alignment[i]["start"] = alignment[i]["end"] - max_duration
1641
+
1642
+ merge_punctuations(alignment, prepend_punctuations, append_punctuations)
1643
+ median_max_durations.append((median_duration, max_duration))
1644
+
1645
+ for segment_idx, segment in enumerate(segments):
1646
+ word_index = 0
1647
+ time_offset = segment[0]["start"]
1648
+ median_duration, max_duration = median_max_durations[segment_idx]
1649
+ for subsegment_idx, subsegment in enumerate(segment):
1650
+ saved_tokens = 0
1651
+ words = []
1652
+
1653
+ while word_index < len(alignments[segment_idx]) and saved_tokens < len(
1654
+ text_tokens_per_segment[segment_idx][subsegment_idx]
1655
+ ):
1656
+ timing = alignments[segment_idx][word_index]
1657
+
1658
+ if timing["word"]:
1659
+ words.append(
1660
+ dict(
1661
+ word=timing["word"],
1662
+ start=round(time_offset + timing["start"], 2),
1663
+ end=round(time_offset + timing["end"], 2),
1664
+ probability=timing["probability"],
1665
+ )
1666
+ )
1667
+
1668
+ saved_tokens += len(timing["tokens"])
1669
+ word_index += 1
1670
+
1671
+ # hack: truncate long words at segment boundaries.
1672
+ # a better segmentation algorithm based on VAD should be able to replace this.
1673
+ if len(words) > 0:
1674
+ # ensure the first and second word after a pause is not longer than
1675
+ # twice the median word duration.
1676
+ if words[0][
1677
+ "end"
1678
+ ] - last_speech_timestamp > median_duration * 4 and (
1679
+ words[0]["end"] - words[0]["start"] > max_duration
1680
+ or (
1681
+ len(words) > 1
1682
+ and words[1]["end"] - words[0]["start"] > max_duration * 2
1683
+ )
1684
+ ):
1685
+ if (
1686
+ len(words) > 1
1687
+ and words[1]["end"] - words[1]["start"] > max_duration
1688
+ ):
1689
+ boundary = max(
1690
+ words[1]["end"] / 2, words[1]["end"] - max_duration
1691
+ )
1692
+ words[0]["end"] = words[1]["start"] = boundary
1693
+ words[0]["start"] = max(0, words[0]["end"] - max_duration)
1694
+
1695
+ # prefer the segment-level start timestamp if the first word is too long.
1696
+ if (
1697
+ subsegment["start"] < words[0]["end"]
1698
+ and subsegment["start"] - 0.5 > words[0]["start"]
1699
+ ):
1700
+ words[0]["start"] = max(
1701
+ 0,
1702
+ min(words[0]["end"] - median_duration, subsegment["start"]),
1703
+ )
1704
+ else:
1705
+ subsegment["start"] = words[0]["start"]
1706
+
1707
+ # prefer the segment-level end timestamp if the last word is too long.
1708
+ if (
1709
+ subsegment["end"] > words[-1]["start"]
1710
+ and subsegment["end"] + 0.5 < words[-1]["end"]
1711
+ ):
1712
+ words[-1]["end"] = max(
1713
+ words[-1]["start"] + median_duration, subsegment["end"]
1714
+ )
1715
+ else:
1716
+ subsegment["end"] = words[-1]["end"]
1717
+
1718
+ last_speech_timestamp = subsegment["end"]
1719
+ segments[segment_idx][subsegment_idx]["words"] = words
1720
+ return last_speech_timestamp
1721
+
1722
+ def find_alignment(
1723
+ self,
1724
+ tokenizer: Tokenizer,
1725
+ text_tokens: List[int],
1726
+ encoder_output: ctranslate2.StorageView,
1727
+ num_frames: int,
1728
+ median_filter_width: int = 7,
1729
+ ) -> List[dict]:
1730
+ if len(text_tokens) == 0:
1731
+ return []
1732
+
1733
+ results = self.model.align(
1734
+ encoder_output,
1735
+ tokenizer.sot_sequence,
1736
+ text_tokens,
1737
+ num_frames,
1738
+ median_filter_width=median_filter_width,
1739
+ )
1740
+ return_list = []
1741
+ for result, text_token in zip(results, text_tokens):
1742
+ text_token_probs = result.text_token_probs
1743
+ alignments = result.alignments
1744
+ text_indices = np.array([pair[0] for pair in alignments])
1745
+ time_indices = np.array([pair[1] for pair in alignments])
1746
+
1747
+ words, word_tokens = tokenizer.split_to_word_tokens(
1748
+ text_token + [tokenizer.eot]
1749
+ )
1750
+ if len(word_tokens) <= 1:
1751
+ # return on eot only
1752
+ # >>> np.pad([], (1, 0))
1753
+ # array([0.])
1754
+ # This results in crashes when we lookup jump_times with float, like
1755
+ # IndexError: arrays used as indices must be of integer (or boolean) type
1756
+ return []
1757
+ word_boundaries = np.pad(
1758
+ np.cumsum([len(t) for t in word_tokens[:-1]]), (1, 0)
1759
+ )
1760
+ if len(word_boundaries) <= 1:
1761
+ return []
1762
+
1763
+ jumps = np.pad(np.diff(text_indices), (1, 0), constant_values=1).astype(
1764
+ bool
1765
+ )
1766
+ jump_times = time_indices[jumps] / self.tokens_per_second
1767
+ start_times = jump_times[word_boundaries[:-1]]
1768
+ end_times = jump_times[word_boundaries[1:]]
1769
+ word_probabilities = [
1770
+ np.mean(text_token_probs[i:j])
1771
+ for i, j in zip(word_boundaries[:-1], word_boundaries[1:])
1772
+ ]
1773
+
1774
+ return_list.append(
1775
+ [
1776
+ dict(
1777
+ word=word,
1778
+ tokens=tokens,
1779
+ start=start,
1780
+ end=end,
1781
+ probability=probability,
1782
+ )
1783
+ for word, tokens, start, end, probability in zip(
1784
+ words, word_tokens, start_times, end_times, word_probabilities
1785
+ )
1786
+ ]
1787
+ )
1788
+ return return_list
1789
+
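A toy walk-through of the boundary arithmetic in find_alignment; the alignment pairs, token groups and the 50 frames-per-second rate below are assumed example values:

import numpy as np

tokens_per_second = 50                              # Whisper encoder frames are 20 ms long
alignments = [(0, 5), (1, 20), (2, 32), (3, 55)]    # (text_index, time_index) pairs
word_tokens = [[101], [102, 103], [50257]]          # two words plus the trailing end-of-text group

text_indices = np.array([pair[0] for pair in alignments])
time_indices = np.array([pair[1] for pair in alignments])

# a "jump" marks the first frame at which the alignment moves on to a new token
jumps = np.pad(np.diff(text_indices), (1, 0), constant_values=1).astype(bool)
jump_times = time_indices[jumps] / tokens_per_second

word_boundaries = np.pad(np.cumsum([len(t) for t in word_tokens[:-1]]), (1, 0))
start_times = jump_times[word_boundaries[:-1]]      # word 0: 0.10 s, word 1: 0.40 s
end_times = jump_times[word_boundaries[1:]]         # word 0: 0.40 s, word 1: 1.10 s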
1790
+ def generate_segment_batched(
1791
+ self,
1792
+ features: torch.Tensor,
1793
+ tokenizer: Tokenizer,
1794
+ options: dict,
1795
+ ):
1796
+ batch_size = features.shape[0]
1797
+ all_tokens = []
1798
+ prompt_reset_since = 0
1799
+
1800
+ if options["initial_prompt"] is not None:
1801
+ initial_prompt = " " + options["initial_prompt"].strip()
1802
+ initial_prompt_tokens = tokenizer.encode(initial_prompt)
1803
+ all_tokens.extend(initial_prompt_tokens)
1804
+ previous_tokens = all_tokens[prompt_reset_since:]
1805
+ prompt = self.get_prompt(
1806
+ tokenizer,
1807
+ previous_tokens,
1808
+ without_timestamps=options["without_timestamps"],
1809
+ prefix=options["prefix"],
1810
+ )
1811
+
1812
+ encoder_output = self.encode(features)
1813
+
1814
+ result = self.model.generate(
1815
+ encoder_output,
1816
+ [prompt] * batch_size,
1817
+ beam_size=options["beam_size"],
1818
+ patience=options["patience"],
1819
+ length_penalty=options["length_penalty"],
1820
+ max_length=self.max_length,
1821
+ suppress_blank=options["suppress_blank"],
1822
+ suppress_tokens=options["suppress_tokens"],
1823
+ return_scores=True,
1824
+ return_no_speech_prob=True,
1825
+ )
1826
+
1827
+ output = []
1828
+ for res in result:
1829
+ output.append({})
1830
+ # return scores
1831
+ seq_len = len(res.sequences_ids[0])
1832
+ cum_logprob = res.scores[0] * (seq_len ** options["length_penalty"])
1833
+ output[-1]["avg_logprob"] = cum_logprob / (seq_len + 1)
1834
+
1835
+ # return no speech prob
1836
+ output[-1]["no_speech_prob"] = res.no_speech_prob
1837
+ output[-1]["tokens"] = res.sequences_ids[0]
1838
+
1839
+ return encoder_output, output
1840
+
1841
+ def detect_language(self, audio: torch.Tensor):
1842
+ to_cpu = self.model.device == "cuda" and len(self.model.device_index) > 1
1843
+ segment = self.feature_extractor(audio, padding=True, to_cpu=to_cpu)[
1844
+ :, : self.feature_extractor.nb_max_frames
1845
+ ]
1846
+ encoder_output = self.encode(segment)
1847
+ results = self.model.detect_language(encoder_output)
1848
+ language_token, language_probability = results[0][0]
1849
+ language = language_token[2:-2]
1850
+ self.logger.info(
1851
+ f"Detected language: {language} ({language_probability:.2f}) in first 30s of audio..."
1852
+ )
1853
+ all_language_probs = [(token[2:-2], prob) for (token, prob) in results[0]]
1854
+ return language, language_probability, all_language_probs
1855
+
1856
+ def detect_language_multi_segment(
1857
+ self, audio: Union[str, BinaryIO, torch.Tensor], params: Optional[dict] = None
1858
+ ):
1859
+ """
1860
+ Detect language based on N highly-confident segments of a language.
1861
+ """
1862
+ # The threshold is used to decide if the audio is silence or not.
1863
+ # The default is 0.02 (2.0%), i.e. if less than 2.0% of the audio is speech
1864
+ # after VAD filtering, the audio is considered silence.
1865
+ if not params:
1866
+ params = {
1867
+ "multilingual": False,
1868
+ "speech_percentage_threshold": 0.02,
1869
+ "language_detection_segments": 4,
1870
+ "vad_filter": True,
1871
+ "vad_min_silence_duration": 2500,
1872
+ "language_threshold": 0.7,
1873
+ }
1874
+
1875
+ if params.get("multilingual", False):
1876
+ logging.warning(
1877
+ "lang_id is not supported for multilingual audios, detecting the major language."
1878
+ )
1879
+
1880
+ speech_percentage_threshold = params.get("speech_percentage_threshold", 0.02)
1881
+ language_threshold = params.get("language_threshold", 0.7)
1882
+ num_detection_segments = params.get("language_detection_segments", 4)
1883
+ vad_filter_enabled = params.get("vad_filter", True)
1884
+ vad_params = dict(
1885
+ min_silence_duration_ms=params.get("vad_min_silence_duration", 2500)
1886
+ )
1887
+
1888
+ if vad_filter_enabled:
1889
+ vad_params = VadOptions(**vad_params)
1890
+
1891
+ # decode audio if it is not decoded already
1892
+ sampling_rate = self.feature_extractor.sampling_rate
1893
+ if not isinstance(audio, torch.Tensor):
1894
+ audio: torch.Tensor = decode_audio(audio, sampling_rate=sampling_rate)
1895
+
1896
+ # calculate duration of audio as number of seconds
1897
+ # audio.shape[0] is the number of samples in the audio
1898
+ # sampling_rate is the number of samples per second
1899
+ # if we divide the number of samples by the number of samples per second,
1900
+ # we get the duration in seconds
1901
+ duration = audio.shape[0] / sampling_rate
1902
+
1903
+ # Check if vad is enabled, and collect voiced segments
1904
+ if vad_filter_enabled:
1905
+ # get chunks of audio that contain speech
1906
+ speech_chunks = get_speech_timestamps(audio, vad_params)
1907
+ # merge chunks of audio that contain speech into a single array
1908
+ audio = collect_chunks(audio, speech_chunks)
1909
+
1910
+ # calculate new duration of audio without silence
1911
+ duration_vad = audio.shape[0] / sampling_rate
1912
+
1913
+ logging.debug(
1914
+ f"Lang ID: VAD filter removed {duration - duration_vad} sec of audio"
1915
+ )
1916
+
1917
+ # if the audio after VAD is less than 2% of the original audio, consider it as silence
1918
+ if duration_vad / duration < speech_percentage_threshold:
1919
+ return {"language_code": None, "language_confidence": 1.0}
1920
+
1921
+ # update duration to be the duration after VAD
1922
+ duration = duration_vad
1923
+
1924
+ # if the duration of the audio is less than 1 second, consider it as silence
1925
+ if duration < 1.0:
1926
+ return {"language_code": None, "language_confidence": 1.0}
1927
+
1928
+ # number of feature frames in 30 seconds of audio is 3000
1929
+ nb_max_frames = self.feature_extractor.nb_max_frames
1930
+
1931
+ # extract features from audio with padding (default)
1932
+ to_cpu = self.model.device == "cuda" and len(self.model.device_index) > 1
1933
+ features = self.feature_extractor(audio, to_cpu=to_cpu)
1934
+
1935
+ # number of segments in the audio
1936
+ num_segments = features.shape[-1] // nb_max_frames
1937
+ # more number of segments than possible with the duration of file
1938
+ if num_detection_segments > num_segments:
1939
+ logging.warning(
1940
+ f"Lang ID: Can not have more segments, setting {num_segments} segments."
1941
+ )
1942
+ num_detection_segments = num_segments
1943
+
1944
+ # create a list of indices to randomly select segments from
1945
+ indices = list(range(num_detection_segments))
1946
+
1947
+ # fix seed to get deterministic results
1948
+ random.seed(0)
1949
+ random.shuffle(indices)
1950
+
1951
+ detected_languages = []
1952
+ all_language_probabilities = defaultdict(list)
1953
+ confident_language_probabilities = defaultdict(list)
1954
+ num_confident_segments_per_language = defaultdict(int)
1955
+
1956
+ # Iterate over the randomly selected indices of the segments.
1957
+ #
1958
+ # For each segment, extract features and detect language.
1959
+ #
1960
+ # If the language is confident, add it to the list of confident segments for that language.
1961
+ #
1962
+ # If the number of confident segments for a language
1963
+ # is greater than or equal to the number of detection segments,
1964
+ # return the language and the average probability of the language.
1965
+ #
1966
+ # If we are unable to get a sufficient number of confident predictions,
1967
+ # return the most frequently detected language with maximum probability.
1968
+ #
1969
+ # We need to get sufficient number of confident predictions per language, not in total.
1970
+
1971
+ for i in indices:
1972
+ segment_features = features[:, i * nb_max_frames : (i + 1) * nb_max_frames]
1973
+ try:
1974
+ encoder_output = self.encode(segment_features)
1975
+ results = self.model.detect_language(encoder_output)[0]
1976
+
1977
+ except ValueError as e: # or RuntimeError
1978
+ logging.error(f"Inference error:{e}")
1979
+
1980
+ # results is the list of classes (languages) and their probabilities (descending),
1981
+ # for eg: [('<|de|>', 0.482177734375),('<|en|>', 0.283447265625),...]
1982
+
1983
+ # take top language token and probability
1984
+ # and parse language token to strip out markers
1985
+ # for eg: '<|de|>' -> 'de'
1986
+
1987
+ language_token = results[0][0]
1988
+ language = language_token[2:-2]
1989
+
1990
+ language_probability = results[0][1]
1991
+
1992
+ detected_languages.append(language)
1993
+ all_language_probabilities[language].append(language_probability)
1994
+
1995
+ # only consider if the language prediction is confident
1996
+ if language_probability > language_threshold:
1997
+ num_confident_segments_per_language[language] += 1
1998
+
1999
+ # Add language and probability to the list of languages when it is confident
2000
+ confident_language_probabilities[language].append(language_probability)
2001
+
2002
+ # return the language when sufficient number of confident segments is achieved
2003
+ if (
2004
+ num_confident_segments_per_language[language]
2005
+ >= num_detection_segments
2006
+ ):
2007
+ # Considering the average probability of only confident segments
2008
+ mean = sum(confident_language_probabilities[language]) / len(
2009
+ confident_language_probabilities[language]
2010
+ )
2011
+ return {
2012
+ "language_code": language,
2013
+ "language_confidence": mean,
2014
+ }
2015
+
2016
+ # if we are unable to get sufficient number of confident predictions,
2017
+ # return the most frequently detected language.
2018
+ # if there is a tie, return the one with maximum average probability.
2019
+ counter = Counter(detected_languages)
2020
+
2021
+ # Define the key function to select frequent language with attached probabilities
2022
+ def key_func(language):
2023
+ # Calculate the frequency of the language
2024
+ frequency = counter[language]
2025
+
2026
+ # Calculate the average probability of the language
2027
+ prob_avg = sum(all_language_probabilities[language]) / len(
2028
+ all_language_probabilities[language]
2029
+ )
2030
+
2031
+ return frequency, prob_avg
2032
+
2033
+ if detected_languages:
2034
+ # Use the key function to find the language with maximum frequency and probability
2035
+ max_language = max(detected_languages, key=key_func)
2036
+ max_probability = sum(all_language_probabilities[max_language]) / len(
2037
+ all_language_probabilities[max_language]
2038
+ )
2039
+
2040
+ # Do additional checks for silence for non-confident case
2041
+ # calculate RMS amplitude and DC offset
2042
+ dc_offset = audio.mean()
2043
+ audio_minus_dc_offset = audio - dc_offset
2044
+ is_silent = (
2045
+ torch.all(audio.abs() < 0.01)
2046
+ or torch.sqrt(torch.mean(audio_minus_dc_offset**2)) < 0.01
2047
+ )
2048
+
2049
+ if is_silent:
2050
+ return {"language_code": None, "language_confidence": 1.0}
2051
+
2052
+ return {
2053
+ "language_code": max_language,
2054
+ "language_confidence": max_probability,
2055
+ }
2056
+
2057
+ # Language is not detected for any segment and none of prev conditions met
2058
+ return {"language_code": None, "language_confidence": 1.0}
2059
+
2060
+
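The fallback selection above (most frequent language, ties broken by average probability) can be illustrated with made-up detections:

from collections import Counter, defaultdict

detected = ["vi", "en", "vi", "en"]
probs = defaultdict(list, {"vi": [0.65, 0.55], "en": [0.45, 0.40]})
counter = Counter(detected)

def key_func(language):
    # sort primarily by how often the language was detected,
    # then by its average probability across all segments
    return counter[language], sum(probs[language]) / len(probs[language])

print(max(set(detected), key=key_func))  # "vi": same frequency, higher mean probability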
2061
+ def restore_speech_timestamps(
2062
+ segments: Iterable[Segment],
2063
+ speech_chunks: List[dict],
2064
+ sampling_rate: int,
2065
+ ) -> Iterable[Segment]:
2066
+ ts_map = SpeechTimestampsMap(speech_chunks, sampling_rate)
2067
+
2068
+ for segment in segments:
2069
+ if segment.words:
2070
+ words = []
2071
+ for word in segment.words:
2072
+ # Ensure the word start and end times are resolved to the same chunk.
2073
+ middle = (word.start + word.end) / 2
2074
+ chunk_index = ts_map.get_chunk_index(middle)
2075
+ word = word._replace(
2076
+ start=ts_map.get_original_time(word.start, chunk_index),
2077
+ end=ts_map.get_original_time(word.end, chunk_index),
2078
+ )
2079
+ words.append(word)
2080
+
2081
+ segment = segment._replace(
2082
+ start=words[0].start,
2083
+ end=words[-1].end,
2084
+ words=words,
2085
+ )
2086
+
2087
+ else:
2088
+ segment = segment._replace(
2089
+ start=ts_map.get_original_time(segment.start),
2090
+ end=ts_map.get_original_time(segment.end),
2091
+ )
2092
+
2093
+ yield segment
2094
+
2095
+
2096
+ def get_ctranslate2_storage(segment: torch.Tensor) -> ctranslate2.StorageView:
2097
+ segment = segment.contiguous()
2098
+ segment = ctranslate2.StorageView.from_array(
2099
+ segment if segment.is_cuda else segment.numpy()
2100
+ ) # torch cpu tensors don't implement __array_interface__
2101
+ # https://github.com/pytorch/pytorch/issues/51156
2102
+ return segment
2103
+
2104
+
2105
+ def get_compression_ratio(text: str) -> float:
2106
+ text_bytes = text.encode("utf-8")
2107
+ return len(text_bytes) / len(zlib.compress(text_bytes))
2108
+
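A quick demonstration of why the compression ratio is a useful repetition signal: repeated text compresses much better than ordinary prose, so a high ratio flags degenerate output.

import zlib

def ratio(text: str) -> float:
    data = text.encode("utf-8")
    return len(data) / len(zlib.compress(data))

print(ratio("This is an ordinary sentence with little repetition."))  # close to 1
print(ratio("sorry " * 50))                                            # much larger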
2109
+
2110
+ def get_suppressed_tokens(
2111
+ tokenizer: Tokenizer,
2112
+ suppress_tokens: Tuple[int],
2113
+ ) -> Optional[List[int]]:
2114
+ if -1 in suppress_tokens:
2115
+ suppress_tokens = [t for t in suppress_tokens if t >= 0]
2116
+ suppress_tokens.extend(tokenizer.non_speech_tokens)
2117
+ elif suppress_tokens is None or len(suppress_tokens) == 0:
2118
+ suppress_tokens = [] # interpret empty string as an empty list
2119
+ else:
2120
+ assert isinstance(suppress_tokens, list), "suppress_tokens must be a list"
2121
+
2122
+ suppress_tokens.extend(
2123
+ [
2124
+ tokenizer.transcribe,
2125
+ tokenizer.translate,
2126
+ tokenizer.sot,
2127
+ tokenizer.sot_prev,
2128
+ tokenizer.sot_lm,
2129
+ ]
2130
+ )
2131
+
2132
+ return tuple(sorted(set(suppress_tokens)))
2133
+
2134
+
2135
+ def merge_punctuations(alignment: List[dict], prepended: str, appended: str) -> None:
2136
+ # merge prepended punctuations
2137
+ i = len(alignment) - 2
2138
+ j = len(alignment) - 1
2139
+ while i >= 0:
2140
+ previous = alignment[i]
2141
+ following = alignment[j]
2142
+ if previous["word"].startswith(" ") and previous["word"].strip() in prepended:
2143
+ # prepend it to the following word
2144
+ following["word"] = previous["word"] + following["word"]
2145
+ if "tokens" in alignment[0].keys():
2146
+ following["tokens"] = previous["tokens"] + following["tokens"]
2147
+ previous["tokens"] = []
2148
+ previous["word"] = ""
2149
+
2150
+ else:
2151
+ j = i
2152
+ i -= 1
2153
+
2154
+ # merge appended punctuations
2155
+ i = 0
2156
+ j = 1
2157
+ while j < len(alignment):
2158
+ previous = alignment[i]
2159
+ following = alignment[j]
2160
+ if not previous["word"].endswith(" ") and following["word"] in appended:
2161
+ # append it to the previous word
2162
+ previous["word"] = previous["word"] + following["word"]
2163
+ if "tokens" in alignment[0].keys():
2164
+ previous["tokens"] = previous["tokens"] + following["tokens"]
2165
+ following["tokens"] = []
2166
+ following["word"] = ""
2167
+
2168
+ else:
2169
+ i = j
2170
+ j += 1
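A small sanity check of merge_punctuations as defined above: a leading '¿' is glued onto the following word and a trailing '.' onto the preceding one, while the emptied entries keep their slots with an empty word. The punctuation sets are passed explicitly here rather than relying on the library defaults.

from faster_whisper.transcribe import merge_punctuations

alignment = [
    {"word": " ¿", "tokens": [1]},
    {"word": " Qué", "tokens": [2]},
    {"word": " tal", "tokens": [3]},
    {"word": ".", "tokens": [4]},
]
merge_punctuations(alignment, prepended="¿([{-", appended=".,!?")
print([a["word"] for a in alignment])  # ['', ' ¿ Qué', ' tal.', '']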
whisper_pipeline/faster-whisper-main/build/lib/faster_whisper/utils.py ADDED
@@ -0,0 +1,157 @@
1
+ import logging
2
+ import os
3
+ import re
4
+
5
+ from typing import List, Optional
6
+
7
+ import huggingface_hub
8
+ import requests
9
+
10
+ from tqdm.auto import tqdm
11
+
12
+ _MODELS = {
13
+ "tiny.en": "Systran/faster-whisper-tiny.en",
14
+ "tiny": "Systran/faster-whisper-tiny",
15
+ "base.en": "Systran/faster-whisper-base.en",
16
+ "base": "Systran/faster-whisper-base",
17
+ "small.en": "Systran/faster-whisper-small.en",
18
+ "small": "Systran/faster-whisper-small",
19
+ "medium.en": "Systran/faster-whisper-medium.en",
20
+ "medium": "Systran/faster-whisper-medium",
21
+ "large-v1": "Systran/faster-whisper-large-v1",
22
+ "large-v2": "Systran/faster-whisper-large-v2",
23
+ "large-v3": "Systran/faster-whisper-large-v3",
24
+ "large": "Systran/faster-whisper-large-v3",
25
+ "distil-large-v2": "Systran/faster-distil-whisper-large-v2",
26
+ "distil-medium.en": "Systran/faster-distil-whisper-medium.en",
27
+ "distil-small.en": "Systran/faster-distil-whisper-small.en",
28
+ "distil-large-v3": "Systran/faster-distil-whisper-large-v3",
29
+ }
30
+
31
+
32
+ def available_models() -> List[str]:
33
+ """Returns the names of available models."""
34
+ return list(_MODELS.keys())
35
+
36
+
37
+ def get_assets_path():
38
+ """Returns the path to the assets directory."""
39
+ return os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets")
40
+
41
+
42
+ def get_logger():
43
+ """Returns the module logger."""
44
+ return logging.getLogger("faster_whisper")
45
+
46
+
47
+ def download_model(
48
+ size_or_id: str,
49
+ output_dir: Optional[str] = None,
50
+ local_files_only: bool = False,
51
+ cache_dir: Optional[str] = None,
52
+ ):
53
+ """Downloads a CTranslate2 Whisper model from the Hugging Face Hub.
54
+
55
+ Args:
56
+ size_or_id: Size of the model to download from https://huggingface.co/Systran
57
+ (tiny, tiny.en, base, base.en, small, small.en, distil-small.en, medium, medium.en,
58
+ distil-medium.en, large-v1, large-v2, large-v3, large, distil-large-v2,
59
+ distil-large-v3), or a CTranslate2-converted model ID from the Hugging Face Hub
60
+ (e.g. Systran/faster-whisper-large-v3).
61
+ output_dir: Directory where the model should be saved. If not set, the model is saved in
62
+ the cache directory.
63
+ local_files_only: If True, avoid downloading the file and return the path to the local
64
+ cached file if it exists.
65
+ cache_dir: Path to the folder where cached files are stored.
66
+
67
+ Returns:
68
+ The path to the downloaded model.
69
+
70
+ Raises:
71
+ ValueError: if the model size is invalid.
72
+ """
73
+ if re.match(r".*/.*", size_or_id):
74
+ repo_id = size_or_id
75
+ else:
76
+ repo_id = _MODELS.get(size_or_id)
77
+ if repo_id is None:
78
+ raise ValueError(
79
+ "Invalid model size '%s', expected one of: %s"
80
+ % (size_or_id, ", ".join(_MODELS.keys()))
81
+ )
82
+
83
+ allow_patterns = [
84
+ "config.json",
85
+ "preprocessor_config.json",
86
+ "model.bin",
87
+ "tokenizer.json",
88
+ "vocabulary.*",
89
+ ]
90
+
91
+ kwargs = {
92
+ "local_files_only": local_files_only,
93
+ "allow_patterns": allow_patterns,
94
+ "tqdm_class": disabled_tqdm,
95
+ }
96
+
97
+ if output_dir is not None:
98
+ kwargs["local_dir"] = output_dir
99
+ kwargs["local_dir_use_symlinks"] = False
100
+
101
+ if cache_dir is not None:
102
+ kwargs["cache_dir"] = cache_dir
103
+
104
+ try:
105
+ return huggingface_hub.snapshot_download(repo_id, **kwargs)
106
+ except (
107
+ huggingface_hub.utils.HfHubHTTPError,
108
+ requests.exceptions.ConnectionError,
109
+ ) as exception:
110
+ logger = get_logger()
111
+ logger.warning(
112
+ "An error occured while synchronizing the model %s from the Hugging Face Hub:\n%s",
113
+ repo_id,
114
+ exception,
115
+ )
116
+ logger.warning(
117
+ "Trying to load the model directly from the local cache, if it exists."
118
+ )
119
+
120
+ kwargs["local_files_only"] = True
121
+ return huggingface_hub.snapshot_download(repo_id, **kwargs)
122
+
123
+
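Typical usage of download_model; the model size and output directory below are just examples, and the returned path can be passed directly to WhisperModel:

from faster_whisper.utils import download_model

model_path = download_model("small", output_dir="./models/faster-whisper-small")
print(model_path)  # directory containing model.bin, config.json, tokenizer.json, ...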
124
+ def format_timestamp(
125
+ seconds: float,
126
+ always_include_hours: bool = False,
127
+ decimal_marker: str = ".",
128
+ ) -> str:
129
+ assert seconds >= 0, "non-negative timestamp expected"
130
+ milliseconds = round(seconds * 1000.0)
131
+
132
+ hours = milliseconds // 3_600_000
133
+ milliseconds -= hours * 3_600_000
134
+
135
+ minutes = milliseconds // 60_000
136
+ milliseconds -= minutes * 60_000
137
+
138
+ seconds = milliseconds // 1_000
139
+ milliseconds -= seconds * 1_000
140
+
141
+ hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
142
+ return (
143
+ f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}"
144
+ )
145
+
146
+
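A few sample calls to format_timestamp; by default the hours field only appears when it is non-zero:

from faster_whisper.utils import format_timestamp

print(format_timestamp(7.5))                              # 00:07.500
print(format_timestamp(3661.5))                           # 01:01:01.500
print(format_timestamp(7.5, always_include_hours=True))   # 00:00:07.500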
147
+ class disabled_tqdm(tqdm):
148
+ def __init__(self, *args, **kwargs):
149
+ kwargs["disable"] = True
150
+ super().__init__(*args, **kwargs)
151
+
152
+
153
+ def get_end(segments: List[dict]) -> Optional[float]:
154
+ return next(
155
+ (w["end"] for s in reversed(segments) for w in reversed(s["words"])),
156
+ segments[-1]["end"] if segments else None,
157
+ )
whisper_pipeline/faster-whisper-main/build/lib/faster_whisper/vad.py ADDED
@@ -0,0 +1,596 @@
1
+ import bisect
2
+ import functools
3
+ import os
4
+
5
+ from abc import ABC
6
+ from collections.abc import Callable
7
+ from typing import List, NamedTuple, Optional, Union
8
+
9
+ import numpy as np
10
+ import torch
11
+
12
+ from pyannote.audio.core.io import AudioFile
13
+ from pyannote.audio.pipelines import VoiceActivityDetection
14
+ from pyannote.audio.pipelines.utils import PipelineModel
15
+ from pyannote.core import Annotation, Segment, SlidingWindowFeature
16
+
17
+ from faster_whisper.utils import get_assets_path
18
+
19
+
20
+ # The code below is adapted from https://github.com/snakers4/silero-vad.
21
+ class VadOptions(NamedTuple):
22
+ """VAD options.
23
+
24
+ Attributes:
25
+ threshold: Speech threshold. Silero VAD outputs speech probabilities for each audio chunk,
26
+ probabilities ABOVE this value are considered as SPEECH. It is better to tune this
27
+ parameter for each dataset separately, but "lazy" 0.5 is pretty good for most datasets.
28
+ min_speech_duration_ms: Final speech chunks shorter than min_speech_duration_ms are thrown out.
29
+ max_speech_duration_s: Maximum duration of speech chunks in seconds. Chunks longer
30
+ than max_speech_duration_s will be split at the timestamp of the last silence that
31
+ lasts more than 100ms (if any), to prevent aggressive cutting. Otherwise, they will be
32
+ split aggressively just before max_speech_duration_s.
33
+ min_silence_duration_ms: In the end of each speech chunk wait for min_silence_duration_ms
34
+ before separating it
35
+ speech_pad_ms: Final speech chunks are padded by speech_pad_ms each side
36
+ """
37
+
38
+ threshold: float = 0.5
39
+ min_speech_duration_ms: int = 250
40
+ max_speech_duration_s: float = float("inf")
41
+ min_silence_duration_ms: int = 2000
42
+ speech_pad_ms: int = 400
43
+
44
+
45
+ def get_speech_timestamps(
46
+ audio: torch.Tensor,
47
+ vad_options: Optional[VadOptions] = None,
48
+ **kwargs,
49
+ ) -> List[dict]:
50
+ """This method is used for splitting long audios into speech chunks using silero VAD.
51
+
52
+ Args:
53
+ audio: One dimensional float array.
54
+ vad_options: Options for VAD processing.
55
+ kwargs: VAD options passed as keyword arguments for backward compatibility.
56
+
57
+ Returns:
58
+ List of dicts containing begin and end samples of each speech chunk.
59
+ """
60
+ if vad_options is None:
61
+ vad_options = VadOptions(**kwargs)
62
+
63
+ threshold = vad_options.threshold
64
+ min_speech_duration_ms = vad_options.min_speech_duration_ms
65
+ max_speech_duration_s = vad_options.max_speech_duration_s
66
+ min_silence_duration_ms = vad_options.min_silence_duration_ms
67
+ window_size_samples = 512
68
+ speech_pad_ms = vad_options.speech_pad_ms
69
+ sampling_rate = 16000
70
+ min_speech_samples = sampling_rate * min_speech_duration_ms / 1000
71
+ speech_pad_samples = sampling_rate * speech_pad_ms / 1000
72
+ max_speech_samples = (
73
+ sampling_rate * max_speech_duration_s
74
+ - window_size_samples
75
+ - 2 * speech_pad_samples
76
+ )
77
+ min_silence_samples = sampling_rate * min_silence_duration_ms / 1000
78
+ min_silence_samples_at_max_speech = sampling_rate * 98 / 1000
79
+
80
+ audio_length_samples = len(audio)
81
+
82
+ model = get_vad_model()
83
+ state, context = model.get_initial_states(batch_size=1)
84
+
85
+ speech_probs = []
86
+ for current_start_sample in range(0, audio_length_samples, window_size_samples):
87
+ chunk = audio[current_start_sample : current_start_sample + window_size_samples]
88
+ if len(chunk) < window_size_samples:
89
+ chunk = np.pad(chunk, (0, int(window_size_samples - len(chunk))))
90
+ speech_prob, state, context = model(chunk, state, context, sampling_rate)
91
+ speech_probs.append(speech_prob)
92
+
93
+ triggered = False
94
+ speeches = []
95
+ current_speech = {}
96
+ neg_threshold = threshold - 0.15
97
+
98
+ # to save potential segment end (and tolerate some silence)
99
+ temp_end = 0
100
+ # to save potential segment limits in case of maximum segment size reached
101
+ prev_end = next_start = 0
102
+
103
+ for i, speech_prob in enumerate(speech_probs):
104
+ if (speech_prob >= threshold) and temp_end:
105
+ temp_end = 0
106
+ if next_start < prev_end:
107
+ next_start = window_size_samples * i
108
+
109
+ if (speech_prob >= threshold) and not triggered:
110
+ triggered = True
111
+ current_speech["start"] = window_size_samples * i
112
+ continue
113
+
114
+ if (
115
+ triggered
116
+ and (window_size_samples * i) - current_speech["start"] > max_speech_samples
117
+ ):
118
+ if prev_end:
119
+ current_speech["end"] = prev_end
120
+ speeches.append(current_speech)
121
+ current_speech = {}
122
+ # previously reached silence (< neg_thres) and is still not speech (< thres)
123
+ if next_start < prev_end:
124
+ triggered = False
125
+ else:
126
+ current_speech["start"] = next_start
127
+ prev_end = next_start = temp_end = 0
128
+ else:
129
+ current_speech["end"] = window_size_samples * i
130
+ speeches.append(current_speech)
131
+ current_speech = {}
132
+ prev_end = next_start = temp_end = 0
133
+ triggered = False
134
+ continue
135
+
136
+ if (speech_prob < neg_threshold) and triggered:
137
+ if not temp_end:
138
+ temp_end = window_size_samples * i
139
+ # condition to avoid cutting in very short silence
140
+ if (window_size_samples * i) - temp_end > min_silence_samples_at_max_speech:
141
+ prev_end = temp_end
142
+ if (window_size_samples * i) - temp_end < min_silence_samples:
143
+ continue
144
+ else:
145
+ current_speech["end"] = temp_end
146
+ if (
147
+ current_speech["end"] - current_speech["start"]
148
+ ) > min_speech_samples:
149
+ speeches.append(current_speech)
150
+ current_speech = {}
151
+ prev_end = next_start = temp_end = 0
152
+ triggered = False
153
+ continue
154
+
155
+ if (
156
+ current_speech
157
+ and (audio_length_samples - current_speech["start"]) > min_speech_samples
158
+ ):
159
+ current_speech["end"] = audio_length_samples
160
+ speeches.append(current_speech)
161
+
162
+ for i, speech in enumerate(speeches):
163
+ if i == 0:
164
+ speech["start"] = int(max(0, speech["start"] - speech_pad_samples))
165
+ if i != len(speeches) - 1:
166
+ silence_duration = speeches[i + 1]["start"] - speech["end"]
167
+ if silence_duration < 2 * speech_pad_samples:
168
+ speech["end"] += int(silence_duration // 2)
169
+ speeches[i + 1]["start"] = int(
170
+ max(0, speeches[i + 1]["start"] - silence_duration // 2)
171
+ )
172
+ else:
173
+ speech["end"] = int(
174
+ min(audio_length_samples, speech["end"] + speech_pad_samples)
175
+ )
176
+ speeches[i + 1]["start"] = int(
177
+ max(0, speeches[i + 1]["start"] - speech_pad_samples)
178
+ )
179
+ else:
180
+ speech["end"] = int(
181
+ min(audio_length_samples, speech["end"] + speech_pad_samples)
182
+ )
183
+
184
+ return speeches
185
+
186
+
187
+ def collect_chunks(audio: torch.Tensor, chunks: List[dict]) -> torch.Tensor:
188
+ """Collects and concatenates audio chunks."""
189
+ if not chunks:
190
+ return torch.tensor([], dtype=torch.float32)
191
+
192
+ return torch.cat([audio[chunk["start"] : chunk["end"]] for chunk in chunks])
193
+
194
+
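A rough usage sketch of the two helpers above, dropping long silences from a file before transcription (the file name and VAD settings are placeholders):

from faster_whisper.audio import decode_audio
from faster_whisper.vad import VadOptions, collect_chunks, get_speech_timestamps

audio = decode_audio("speech.wav", sampling_rate=16000)
chunks = get_speech_timestamps(audio, VadOptions(min_silence_duration_ms=500))
voiced = collect_chunks(audio, chunks)
print(f"kept {voiced.shape[0] / 16000:.1f}s of {audio.shape[0] / 16000:.1f}s")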
195
+ class SpeechTimestampsMap:
196
+ """Helper class to restore original speech timestamps."""
197
+
198
+ def __init__(self, chunks: List[dict], sampling_rate: int, time_precision: int = 2):
199
+ self.sampling_rate = sampling_rate
200
+ self.time_precision = time_precision
201
+ self.chunk_end_sample = []
202
+ self.total_silence_before = []
203
+
204
+ previous_end = 0
205
+ silent_samples = 0
206
+
207
+ for chunk in chunks:
208
+ silent_samples += chunk["start"] - previous_end
209
+ previous_end = chunk["end"]
210
+
211
+ self.chunk_end_sample.append(chunk["end"] - silent_samples)
212
+ self.total_silence_before.append(silent_samples / sampling_rate)
213
+
214
+ def get_original_time(
215
+ self,
216
+ time: float,
217
+ chunk_index: Optional[int] = None,
218
+ ) -> float:
219
+ if chunk_index is None:
220
+ chunk_index = self.get_chunk_index(time)
221
+
222
+ total_silence_before = self.total_silence_before[chunk_index]
223
+ return round(total_silence_before + time, self.time_precision)
224
+
225
+ def get_chunk_index(self, time: float) -> int:
226
+ sample = int(time * self.sampling_rate)
227
+ return min(
228
+ bisect.bisect(self.chunk_end_sample, sample),
229
+ len(self.chunk_end_sample) - 1,
230
+ )
231
+
232
+
233
+ @functools.lru_cache
234
+ def get_vad_model():
235
+ """Returns the VAD model instance."""
236
+ path = os.path.join(get_assets_path(), "silero_vad.onnx")
237
+ return SileroVADModel(path)
238
+
239
+
240
+ class SileroVADModel:
241
+ def __init__(self, path):
242
+ try:
243
+ import onnxruntime
244
+ except ImportError as e:
245
+ raise RuntimeError(
246
+ "Applying the VAD filter requires the onnxruntime package"
247
+ ) from e
248
+
249
+ opts = onnxruntime.SessionOptions()
250
+ opts.inter_op_num_threads = 1
251
+ opts.intra_op_num_threads = 1
252
+ opts.log_severity_level = 4
253
+
254
+ self.session = onnxruntime.InferenceSession(
255
+ path,
256
+ providers=["CPUExecutionProvider"],
257
+ sess_options=opts,
258
+ )
259
+
260
+ def get_initial_states(self, batch_size: int):
261
+ state = np.zeros((2, batch_size, 128), dtype=np.float32)
262
+ context = np.zeros((batch_size, 64), dtype=np.float32)
263
+ return state, context
264
+
265
+ def __call__(self, x, state, context, sr: int):
266
+ if len(x.shape) == 1:
267
+ x = np.expand_dims(x, 0)
268
+ if len(x.shape) > 2:
269
+ raise ValueError(
270
+ f"Too many dimensions for input audio chunk {len(x.shape)}"
271
+ )
272
+ if sr / x.shape[1] > 31.25:
273
+ raise ValueError("Input audio chunk is too short")
274
+
275
+ x = np.concatenate([context, x], axis=1)
276
+
277
+ ort_inputs = {
278
+ "input": x,
279
+ "state": state,
280
+ "sr": np.array(sr, dtype="int64"),
281
+ }
282
+
283
+ out, state = self.session.run(None, ort_inputs)
284
+ context = x[..., -64:]
285
+
286
+ return out, state, context
287
+
288
+
289
+ # BSD 2-Clause License
290
+
291
+ # Copyright (c) 2024, Max Bain
292
+
293
+ # Redistribution and use in source and binary forms, with or without
294
+ # modification, are permitted provided that the following conditions are met:
295
+
296
+ # 1. Redistributions of source code must retain the above copyright notice, this
297
+ # list of conditions and the following disclaimer.
298
+
299
+ # 2. Redistributions in binary form must reproduce the above copyright notice,
300
+ # this list of conditions and the following disclaimer in the documentation
301
+ # and/or other materials provided with the distribution.
302
+
303
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
304
+ # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
305
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
306
+ # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
307
+ # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
308
+ # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
309
+ # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
310
+ # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
311
+ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
312
+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
313
+
314
+
315
+ # The code below is copied from whisper-x (https://github.com/m-bain/whisperX)
316
+ # and adapted for faster_whisper.
317
+ class SegmentX:
318
+ def __init__(self, start, end, speaker=None):
319
+ self.start = start
320
+ self.end = end
321
+ self.speaker = speaker
322
+
323
+
324
+ class VoiceActivitySegmentation(VoiceActivityDetection, ABC):
325
+ """Pipeline wrapper class for Voice Activity Segmentation based on VAD scores."""
326
+
327
+ def __init__(
328
+ self,
329
+ segmentation: PipelineModel = "pyannote/segmentation",
330
+ device: Optional[Union[str, torch.device]] = None,
331
+ fscore: bool = False,
332
+ use_auth_token: Optional[str] = None,
333
+ **inference_kwargs,
334
+ ):
335
+ """Initialize the pipeline with the model name and the optional device.
336
+
337
+ Args:
338
+ dict parameters of VoiceActivityDetection class from pyannote:
339
+ segmentation (PipelineModel): Loaded model name.
340
+ device (torch.device or None): Device to perform the segmentation.
341
+ fscore (bool): Flag indicating whether to compute F-score during inference.
342
+ use_auth_token (str or None): Optional authentication token for model access.
343
+ inference_kwargs (dict): Additional arguments from VoiceActivityDetection pipeline.
344
+ """
345
+ super().__init__(
346
+ segmentation=segmentation,
347
+ device=device,
348
+ fscore=fscore,
349
+ use_auth_token=use_auth_token,
350
+ **inference_kwargs,
351
+ )
352
+
353
+ def apply(
354
+ self, file: AudioFile, hook: Optional[Callable] = None
355
+ ) -> SlidingWindowFeature:
356
+ """Apply voice activity detection on the audio file.
357
+
358
+ Args:
359
+ file (AudioFile): Processed file.
360
+ hook (callable): Hook called with signature: hook("step_name", step_artefact, file=file)
361
+
362
+ Returns:
363
+ segmentations (SlidingWindowFeature): Voice activity segmentation.
364
+ """
365
+ # setup hook (e.g. for debugging purposes)
366
+ hook = self.setup_hook(file, hook=hook)
367
+
368
+ # apply segmentation model if needed
369
+ # output shape is (num_chunks, num_frames, 1)
370
+ if self.training:
371
+ if self.CACHED_SEGMENTATION in file:
372
+ segmentations = file[self.CACHED_SEGMENTATION]
373
+ else:
374
+ segmentations = self._segmentation(file)
375
+ file[self.CACHED_SEGMENTATION] = segmentations
376
+ else:
377
+ segmentations: SlidingWindowFeature = self._segmentation(file)
378
+
379
+ return segmentations
380
+
381
+
382
+ class BinarizeVadScores:
383
+ """Binarize detection scores using hysteresis thresholding.
384
+
385
+ Reference:
386
+ Gregory Gelly and Jean-Luc Gauvain. "Minimum Word Error Training of
387
+ RNN-based Voice Activity Detection", InterSpeech 2015.
388
+
389
+ Modified by Max Bain to include WhisperX's min-cut operation
390
+ https://arxiv.org/abs/2303.00747
391
+
392
+ """
393
+
394
+ def __init__(
395
+ self,
396
+ onset: float = 0.5,
397
+ offset: Optional[float] = None,
398
+ min_duration_on: float = 0.0,
399
+ min_duration_off: float = 0.0,
400
+ pad_onset: float = 0.0,
401
+ pad_offset: float = 0.0,
402
+ max_duration: float = float("inf"),
403
+ ):
404
+ """Initializes the parameters for Binarizing the VAD scores.
405
+
406
+ Args:
407
+ onset (float, optional):
408
+ Onset threshold. Defaults to 0.5.
409
+ offset (float, optional):
410
+ Offset threshold. Defaults to `onset`.
411
+ min_duration_on (float, optional):
412
+ Remove active regions shorter than that many seconds. Defaults to 0s.
413
+ min_duration_off (float, optional):
414
+ Fill inactive regions shorter than that many seconds. Defaults to 0s.
415
+ pad_onset (float, optional):
416
+ Extend active regions by moving their start time by that many seconds.
417
+ Defaults to 0s.
418
+ pad_offset (float, optional):
419
+ Extend active regions by moving their end time by that many seconds.
420
+ Defaults to 0s.
421
+ max_duration (float):
422
+ The maximum length of an active segment.
423
+ """
424
+ super().__init__()
425
+
426
+ self.onset = onset
427
+ self.offset = offset or onset
428
+
429
+ self.pad_onset = pad_onset
430
+ self.pad_offset = pad_offset
431
+
432
+ self.min_duration_on = min_duration_on
433
+ self.min_duration_off = min_duration_off
434
+
435
+ self.max_duration = max_duration
436
+
437
+ def __get_active_regions(self, scores: SlidingWindowFeature) -> Annotation:
438
+ """Extract active regions from VAD scores.
439
+
440
+ Args:
441
+ scores (SlidingWindowFeature): Detection scores.
442
+
443
+ Returns:
444
+ active (Annotation): Active regions.
445
+ """
446
+ num_frames, num_classes = scores.data.shape
447
+ frames = scores.sliding_window
448
+ timestamps = [frames[i].middle for i in range(num_frames)]
449
+ # annotation meant to store 'active' regions
450
+ active = Annotation()
451
+ for k, k_scores in enumerate(scores.data.T):
452
+ label = k if scores.labels is None else scores.labels[k]
453
+
454
+ # initial state
455
+ start = timestamps[0]
456
+ is_active = k_scores[0] > self.onset
457
+ curr_scores = [k_scores[0]]
458
+ curr_timestamps = [start]
459
+ t = start
460
+ # optionally add `strict=False` for python 3.10 or later
461
+ for t, y in zip(timestamps[1:], k_scores[1:]):
462
+ # currently active
463
+ if is_active:
464
+ curr_duration = t - start
465
+ if curr_duration > self.max_duration:
466
+ search_after = len(curr_scores) // 2
467
+ # divide segment
468
+ min_score_div_idx = search_after + np.argmin(
469
+ curr_scores[search_after:]
470
+ )
471
+ min_score_t = curr_timestamps[min_score_div_idx]
472
+ region = Segment(
473
+ start - self.pad_onset, min_score_t + self.pad_offset
474
+ )
475
+ active[region, k] = label
476
+ start = curr_timestamps[min_score_div_idx]
477
+ curr_scores = curr_scores[min_score_div_idx + 1 :]
478
+ curr_timestamps = curr_timestamps[min_score_div_idx + 1 :]
479
+ # switching from active to inactive
480
+ elif y < self.offset:
481
+ region = Segment(start - self.pad_onset, t + self.pad_offset)
482
+ active[region, k] = label
483
+ start = t
484
+ is_active = False
485
+ curr_scores = []
486
+ curr_timestamps = []
487
+ curr_scores.append(y)
488
+ curr_timestamps.append(t)
489
+ # currently inactive
490
+ else:
491
+ # switching from inactive to active
492
+ if y > self.onset:
493
+ start = t
494
+ is_active = True
495
+
496
+ # if active at the end, add final region
497
+ if is_active:
498
+ region = Segment(start - self.pad_onset, t + self.pad_offset)
499
+ active[region, k] = label
500
+
501
+ return active
502
+
503
+ def __call__(self, scores: SlidingWindowFeature) -> Annotation:
504
+ """Binarize detection scores.
505
+
506
+ Args:
507
+ scores (SlidingWindowFeature): Detection scores.
508
+
509
+ Returns:
510
+ active (Annotation): Binarized scores.
511
+ """
512
+ active = self.__get_active_regions(scores)
513
+ # because of padding, some active regions might be overlapping: merge them.
514
+ # also: fill same speaker gaps shorter than min_duration_off
515
+ if self.pad_offset > 0.0 or self.pad_onset > 0.0 or self.min_duration_off > 0.0:
516
+ if self.max_duration < float("inf"):
517
+ raise NotImplementedError("This would break current max_duration param")
518
+ active = active.support(collar=self.min_duration_off)
519
+
520
+ # remove tracks shorter than min_duration_on
521
+ if self.min_duration_on > 0:
522
+ for segment, track in list(active.itertracks()):
523
+ if segment.duration < self.min_duration_on:
524
+ del active[segment, track]
525
+
526
+ return active
527
+
528
+
529
+ def merge_chunks(
530
+ segments,
531
+ chunk_length,
532
+ onset: float = 0.5,
533
+ offset: Optional[float] = None,
534
+ edge_padding: float = 0.1,
535
+ ):
536
+ """
537
+ Merge VAD speech turns into chunks of at most `chunk_length` seconds (the merge operation described in the WhisperX paper).
538
+ """
539
+ curr_end = 0
540
+ merged_segments = []
541
+ seg_idxs = []
542
+ speaker_idxs = []
543
+
544
+ assert chunk_length > 0
545
+ binarize = BinarizeVadScores(max_duration=chunk_length, onset=onset, offset=offset)
546
+ segments = binarize(segments)
547
+ segments_list = []
548
+ for speech_turn in segments.get_timeline():
549
+ segments_list.append(
550
+ SegmentX(
551
+ max(0.0, speech_turn.start - edge_padding),
552
+ speech_turn.end + edge_padding,
553
+ "UNKNOWN",
554
+ )
555
+ ) # 100ms edge padding to account for edge errors
556
+
557
+ if len(segments_list) == 0:
558
+ print("No active speech found in audio")
559
+ return []
560
+
561
+ # Make sure the starting point is the start of the first segment.
562
+ curr_start = segments_list[0].start
563
+
564
+ for idx, seg in enumerate(segments_list):
565
+ # if any segment start timing is less than previous segment end timing,
566
+ # reset the edge padding. Similarly for end timing.
567
+ if idx > 0:
568
+ if seg.start < segments_list[idx - 1].end:
569
+ seg.start += edge_padding
570
+ if idx < len(segments_list) - 1:
571
+ if seg.end > segments_list[idx + 1].start:
572
+ seg.end -= edge_padding
573
+
574
+ if seg.end - curr_start > chunk_length and curr_end - curr_start > 0:
575
+ merged_segments.append(
576
+ {
577
+ "start": curr_start,
578
+ "end": curr_end,
579
+ "segments": seg_idxs,
580
+ }
581
+ )
582
+ curr_start = seg.start
583
+ seg_idxs = []
584
+ speaker_idxs = []
585
+ curr_end = seg.end
586
+ seg_idxs.append((seg.start, seg.end))
587
+ speaker_idxs.append(seg.speaker)
588
+ # add final
589
+ merged_segments.append(
590
+ {
591
+ "start": curr_start,
592
+ "end": curr_end,
593
+ "segments": seg_idxs,
594
+ }
595
+ )
596
+ return merged_segments
whisper_pipeline/faster-whisper-main/build/lib/faster_whisper/version.py ADDED
@@ -0,0 +1,3 @@
1
+ """Version information."""
2
+
3
+ __version__ = "1.0.3"
whisper_pipeline/faster-whisper-main/docker/Dockerfile ADDED
@@ -0,0 +1,6 @@
1
+ FROM nvidia/cuda:12.2.2-cudnn8-runtime-ubuntu22.04
2
+ WORKDIR /root
3
+ RUN apt-get update -y && apt-get install -y python3-pip
4
+ COPY infer.py jfk.flac ./
5
+ RUN pip3 install faster-whisper
6
+ CMD ["python3", "infer.py"]
whisper_pipeline/faster-whisper-main/docker/infer.py ADDED
@@ -0,0 +1,7 @@
1
+ from faster_whisper import WhisperModel
2
+
3
+ jfk_path = "jfk.flac"
4
+ model = WhisperModel("tiny", device="cuda")
5
+ segments, info = model.transcribe(jfk_path, word_timestamps=True)
6
+ for segment in segments:
7
+ print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
whisper_pipeline/faster-whisper-main/docker/jfk.flac ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:63a4b1e4c1dc655ac70961ffbf518acd249df237e5a0152faae9a4a836949715
3
+ size 1152693
whisper_pipeline/faster-whisper-main/faster_whisper.egg-info/PKG-INFO ADDED
@@ -0,0 +1,347 @@
1
+ Metadata-Version: 2.1
2
+ Name: faster-whisper
3
+ Version: 1.0.3
4
+ Summary: Faster Whisper transcription with CTranslate2
5
+ Home-page: https://github.com/SYSTRAN/faster-whisper
6
+ Author: Guillaume Klein
7
+ License: MIT
8
+ Keywords: openai whisper speech ctranslate2 inference quantization transformer
9
+ Platform: UNKNOWN
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: Intended Audience :: Science/Research
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Programming Language :: Python :: 3
15
+ Classifier: Programming Language :: Python :: 3 :: Only
16
+ Classifier: Programming Language :: Python :: 3.8
17
+ Classifier: Programming Language :: Python :: 3.9
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
21
+ Requires-Python: >=3.8
22
+ Description-Content-Type: text/markdown
23
+ Provides-Extra: conversion
24
+ Provides-Extra: dev
25
+ License-File: LICENSE
26
+
27
+ [![CI](https://github.com/SYSTRAN/faster-whisper/workflows/CI/badge.svg)](https://github.com/SYSTRAN/faster-whisper/actions?query=workflow%3ACI) [![PyPI version](https://badge.fury.io/py/faster-whisper.svg)](https://badge.fury.io/py/faster-whisper)
28
+
29
+ # Faster Whisper transcription with CTranslate2
30
+
31
+ **faster-whisper** is a reimplementation of OpenAI's Whisper model using [CTranslate2](https://github.com/OpenNMT/CTranslate2/), which is a fast inference engine for Transformer models.
32
+
33
+ This implementation is up to 4 times faster than [openai/whisper](https://github.com/openai/whisper) for the same accuracy while using less memory. The efficiency can be further improved with 8-bit quantization on both CPU and GPU.
34
+
35
+ ## Benchmark
36
+
37
+ ### Whisper
38
+
39
+ For reference, here's the time and memory usage that are required to transcribe [**13 minutes**](https://www.youtube.com/watch?v=0u7tTptBo9I) of audio using different implementations:
40
+
41
+ * [openai/whisper](https://github.com/openai/whisper)@[6dea21fd](https://github.com/openai/whisper/commit/6dea21fd7f7253bfe450f1e2512a0fe47ee2d258)
42
+ * [whisper.cpp](https://github.com/ggerganov/whisper.cpp)@[3b010f9](https://github.com/ggerganov/whisper.cpp/commit/3b010f9bed9a6068609e9faf52383aea792b0362)
43
+ * [faster-whisper](https://github.com/SYSTRAN/faster-whisper)@[cce6b53e](https://github.com/SYSTRAN/faster-whisper/commit/cce6b53e4554f71172dad188c45f10fb100f6e3e)
44
+
45
+ ### Large-v2 model on GPU
46
+
47
+ | Implementation | Precision | Beam size | Time | Max. GPU memory | Max. CPU memory |
48
+ | --- | --- | --- | --- | --- | --- |
49
+ | openai/whisper | fp16 | 5 | 4m30s | 11325MB | 9439MB |
50
+ | faster-whisper | fp16 | 5 | 54s | 4755MB | 3244MB |
51
+ | faster-whisper | int8 | 5 | 59s | 3091MB | 3117MB |
52
+
53
+ *Executed with CUDA 11.7.1 on an NVIDIA Tesla V100S.*
54
+
55
+ ### Small model on CPU
56
+
57
+ | Implementation | Precision | Beam size | Time | Max. memory |
58
+ | --- | --- | --- | --- | --- |
59
+ | openai/whisper | fp32 | 5 | 10m31s | 3101MB |
60
+ | whisper.cpp | fp32 | 5 | 17m42s | 1581MB |
61
+ | whisper.cpp | fp16 | 5 | 12m39s | 873MB |
62
+ | faster-whisper | fp32 | 5 | 2m44s | 1675MB |
63
+ | faster-whisper | int8 | 5 | 2m04s | 995MB |
64
+
65
+ *Executed with 8 threads on an Intel(R) Xeon(R) Gold 6226R.*
66
+
67
+
68
+ ### Distil-whisper
69
+
70
+ | Implementation | Precision | Beam size | Time | Gigaspeech WER |
71
+ | --- | --- | --- | --- | --- |
72
+ | distil-whisper/distil-large-v2 | fp16 | 4 | - | 10.36 |
73
+ | [faster-distil-large-v2](https://huggingface.co/Systran/faster-distil-whisper-large-v2) | fp16 | 5 | - | 10.28 |
74
+ | distil-whisper/distil-medium.en | fp16 | 4 | - | 11.21 |
75
+ | [faster-distil-medium.en](https://huggingface.co/Systran/faster-distil-whisper-medium.en) | fp16 | 5 | - | 11.21 |
76
+
77
+ *Executed with CUDA 11.4 on an NVIDIA 3090.*
78
+
79
+ <details>
80
+ <summary>testing details (click to expand)</summary>
81
+
82
+ For `distil-whisper/distil-large-v2`, the WER is tested with the code sample from [link](https://huggingface.co/distil-whisper/distil-large-v2#evaluation). For `faster-distil-whisper`, the WER is tested with the following settings:
83
+ ```python
84
+ from faster_whisper import WhisperModel
85
+
86
+ model_size = "distil-large-v2"
87
+ # model_size = "distil-medium.en"
88
+ # Run on GPU with FP16
89
+ model = WhisperModel(model_size, device="cuda", compute_type="float16")
90
+ segments, info = model.transcribe("audio.mp3", beam_size=5, language="en")
91
+ ```
92
+ </details>
93
+
94
+ ## Requirements
95
+
96
+ * Python 3.8 or greater
97
+
98
+
99
+ ### GPU
100
+
101
+ GPU execution requires the following NVIDIA libraries to be installed:
102
+
103
+ * [cuBLAS for CUDA 12](https://developer.nvidia.com/cublas)
104
+ * [cuDNN 8 for CUDA 12](https://developer.nvidia.com/cudnn)
105
+
106
+ **Note**: Latest versions of `ctranslate2` support CUDA 12 only. For CUDA 11, the current workaround is downgrading to the `3.24.0` version of `ctranslate2` (this can be done with `pip install --force-reinstall ctranslate2==3.24.0` or by specifying the version in a `requirements.txt` file).
107
+
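For instance, the CUDA 11 workaround from the note above can be applied on the command line or pinned in a requirements file:

```bash
# downgrade to the last ctranslate2 release built against CUDA 11
pip install --force-reinstall ctranslate2==3.24.0
```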
108
+ There are multiple ways to install the NVIDIA libraries mentioned above. The recommended way is described in the official NVIDIA documentation, but we also suggest other installation methods below.
109
+
110
+ <details>
111
+ <summary>Other installation methods (click to expand)</summary>
112
+
113
+
114
+ **Note:** For all these methods below, keep in mind the above note regarding CUDA versions. Depending on your setup, you may need to install the _CUDA 11_ versions of libraries that correspond to the CUDA 12 libraries listed in the instructions below.
115
+
116
+ #### Use Docker
117
+
118
+ The libraries (cuBLAS, cuDNN) are installed in these official NVIDIA CUDA Docker images: `nvidia/cuda:12.0.0-runtime-ubuntu20.04` or `nvidia/cuda:12.0.0-runtime-ubuntu22.04`.
119
+
120
+ #### Install with `pip` (Linux only)
121
+
122
+ On Linux these libraries can be installed with `pip`. Note that `LD_LIBRARY_PATH` must be set before launching Python.
123
+
124
+ ```bash
125
+ pip install nvidia-cublas-cu12 nvidia-cudnn-cu12
126
+
127
+ export LD_LIBRARY_PATH=`python3 -c 'import os; import nvidia.cublas.lib; import nvidia.cudnn.lib; print(os.path.dirname(nvidia.cublas.lib.__file__) + ":" + os.path.dirname(nvidia.cudnn.lib.__file__))'`
128
+ ```
129
+
130
+ **Note**: Version 9+ of `nvidia-cudnn-cu12` appears to cause issues due to its reliance on cuDNN 9 (Faster-Whisper does not currently support cuDNN 9). Ensure your version of the Python package is for cuDNN 8.
131
+
132
+ #### Download the libraries from Purfview's repository (Windows & Linux)
133
+
134
+ Purfview's [whisper-standalone-win](https://github.com/Purfview/whisper-standalone-win) provides the required NVIDIA libraries for Windows & Linux in a [single archive](https://github.com/Purfview/whisper-standalone-win/releases/tag/libs). Decompress the archive and place the libraries in a directory included in the `PATH`.
135
+
136
+ </details>
137
+
138
+ ## Installation
139
+
140
+ The module can be installed from [PyPI](https://pypi.org/project/faster-whisper/):
141
+
142
+ ```bash
143
+ pip install faster-whisper
144
+ ```
145
+
146
+ <details>
147
+ <summary>Other installation methods (click to expand)</summary>
148
+
149
+ ### Install the master branch
150
+
151
+ ```bash
152
+ pip install --force-reinstall "faster-whisper @ https://github.com/SYSTRAN/faster-whisper/archive/refs/heads/master.tar.gz"
153
+ ```
154
+
155
+ ### Install a specific commit
156
+
157
+ ```bash
158
+ pip install --force-reinstall "faster-whisper @ https://github.com/SYSTRAN/faster-whisper/archive/a4f1cc8f11433e454c3934442b5e1a4ed5e865c3.tar.gz"
159
+ ```
160
+
161
+ </details>
162
+
163
+ ## Usage
164
+
165
+ ### Faster-whisper
166
+
167
+ ```python
168
+ from faster_whisper import WhisperModel
169
+
170
+ model_size = "large-v3"
171
+
172
+ # Run on GPU with FP16
173
+ model = WhisperModel(model_size, device="cuda", compute_type="float16")
174
+
175
+ # or run on GPU with INT8
176
+ # model = WhisperModel(model_size, device="cuda", compute_type="int8_float16")
177
+ # or run on CPU with INT8
178
+ # model = WhisperModel(model_size, device="cpu", compute_type="int8")
179
+
180
+ segments, info = model.transcribe("audio.mp3", beam_size=5)
181
+
182
+ print("Detected language '%s' with probability %f" % (info.language, info.language_probability))
183
+
184
+ for segment in segments:
185
+ print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
186
+ ```
187
+
188
+ **Warning:** `segments` is a *generator* so the transcription only starts when you iterate over it. The transcription can be run to completion by gathering the segments in a list or a `for` loop:
189
+
190
+ ```python
191
+ segments, _ = model.transcribe("audio.mp3")
192
+ segments = list(segments) # The transcription will actually run here.
193
+ ```
194
+
195
+ ### multi-segment language detection
196
+
197
+ To directly use the model for improved language detection, the following code snippet can be used:
198
+
199
+ ```python
200
+ from faster_whisper import WhisperModel
201
+ model = WhisperModel("medium", device="cuda", compute_type="float16")
202
+ language_info = model.detect_language_multi_segment("audio.mp3")
203
+ ```
204
+
205
+ ### Batched faster-whisper
206
+
207
+
208
+ The batched version of faster-whisper is inspired by [whisper-x](https://github.com/m-bain/whisperX), licensed under the BSD-2-Clause license, and integrates its VAD model into this library. We modified this implementation and also replaced the feature extraction with a faster torch-based implementation. The batched version improves the speed up to 10-12x compared to the openAI implementation and 3-4x compared to the sequential faster-whisper version. It works by transcribing semantically meaningful audio chunks as batches, leading to faster inference.
209
+
210
+ The following code snippet illustrates how to run inference with the batched version on an example audio file. Please also refer to the test scripts of batched faster-whisper.
211
+
212
+ ```python
213
+ from faster_whisper import WhisperModel, BatchedInferencePipeline
214
+
215
+ model = WhisperModel("medium", device="cuda", compute_type="float16")
216
+ batched_model = BatchedInferencePipeline(model=model)
217
+ segments, info = batched_model.transcribe("audio.mp3", batch_size=16)
218
+
219
+ for segment in segments:
220
+ print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
221
+ ```
222
+
223
+ ### Faster Distil-Whisper
224
+
225
+ The Distil-Whisper checkpoints are compatible with the Faster-Whisper package. In particular, the latest [distil-large-v3](https://huggingface.co/distil-whisper/distil-large-v3)
226
+ checkpoint is intrinsically designed to work with the Faster-Whisper transcription algorithm. The following code snippet
227
+ demonstrates how to run inference with distil-large-v3 on a specified audio file:
228
+
229
+ ```python
230
+ from faster_whisper import WhisperModel
231
+
232
+ model_size = "distil-large-v3"
233
+
234
+ model = WhisperModel(model_size, device="cuda", compute_type="float16")
235
+ segments, info = model.transcribe("audio.mp3", beam_size=5, language="en", condition_on_previous_text=False)
236
+
237
+ for segment in segments:
238
+ print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))
239
+ ```
240
+
241
+ For more information about the distil-large-v3 model, refer to the original [model card](https://huggingface.co/distil-whisper/distil-large-v3).
242
+
243
+ ### Word-level timestamps
244
+
245
+ ```python
246
+ segments, _ = model.transcribe("audio.mp3", word_timestamps=True)
247
+
248
+ for segment in segments:
249
+ for word in segment.words:
250
+ print("[%.2fs -> %.2fs] %s" % (word.start, word.end, word.word))
251
+ ```
252
+
253
+ ### VAD filter
254
+
255
+ The library integrates the [Silero VAD](https://github.com/snakers4/silero-vad) model to filter out parts of the audio without speech:
256
+
257
+ ```python
258
+ segments, _ = model.transcribe("audio.mp3", vad_filter=True)
259
+ ```
260
+
261
+ The default behavior is conservative and only removes silence longer than 2 seconds. See the available VAD parameters and default values in the [source code](https://github.com/SYSTRAN/faster-whisper/blob/master/faster_whisper/vad.py). They can be customized with the dictionary argument `vad_parameters`:
262
+
263
+ ```python
264
+ segments, _ = model.transcribe(
265
+ "audio.mp3",
266
+ vad_filter=True,
267
+ vad_parameters=dict(min_silence_duration_ms=500),
268
+ )
269
+ ```
270
+
271
+ ### Logging
272
+
273
+ The library logging level can be configured like this:
274
+
275
+ ```python
276
+ import logging
277
+
278
+ logging.basicConfig()
279
+ logging.getLogger("faster_whisper").setLevel(logging.DEBUG)
280
+ ```
281
+
282
+ ### Going further
283
+
284
+ See more model and transcription options in the [`WhisperModel`](https://github.com/SYSTRAN/faster-whisper/blob/master/faster_whisper/transcribe.py) class implementation.
285
+
286
+ ## Community integrations
287
+
288
+ Here is a non-exhaustive list of open-source projects using faster-whisper. Feel free to add your project to the list!
289
+
290
+
291
+ * [faster-whisper-server](https://github.com/fedirz/faster-whisper-server) is an OpenAI compatible server using `faster-whisper`. It's easily deployable with Docker, works with OpenAI SDKs/CLI, supports streaming, and live transcription.
292
+ * [WhisperX](https://github.com/m-bain/whisperX) is an award-winning Python library that offers speaker diarization and accurate word-level timestamps using wav2vec2 alignment.
293
+ * [whisper-ctranslate2](https://github.com/Softcatala/whisper-ctranslate2) is a command line client based on faster-whisper and compatible with the original client from openai/whisper.
294
+ * [whisper-diarize](https://github.com/MahmoudAshraf97/whisper-diarization) is a speaker diarization tool that is based on faster-whisper and NVIDIA NeMo.
295
+ * [whisper-standalone-win](https://github.com/Purfview/whisper-standalone-win) provides standalone CLI executables of faster-whisper for Windows, Linux & macOS.
296
+ * [asr-sd-pipeline](https://github.com/hedrergudene/asr-sd-pipeline) provides a scalable, modular, end to end multi-speaker speech to text solution implemented using AzureML pipelines.
297
+ * [Open-Lyrics](https://github.com/zh-plus/Open-Lyrics) is a Python library that transcribes voice files using faster-whisper, and translates/polishes the resulting text into `.lrc` files in the desired language using OpenAI-GPT.
298
+ * [wscribe](https://github.com/geekodour/wscribe) is a flexible transcript generation tool supporting faster-whisper; it can export word-level transcripts, which can then be edited with [wscribe-editor](https://github.com/geekodour/wscribe-editor).
299
+ * [aTrain](https://github.com/BANDAS-Center/aTrain) is a graphical user interface implementation of faster-whisper developed at the BANDAS-Center at the University of Graz for transcription and diarization in Windows ([Windows Store App](https://apps.microsoft.com/detail/atrain/9N15Q44SZNS2)) and Linux.
300
+ * [Whisper-Streaming](https://github.com/ufal/whisper_streaming) implements real-time mode for offline Whisper-like speech-to-text models with faster-whisper as the most recommended back-end. It implements a streaming policy with self-adaptive latency based on the actual source complexity, and demonstrates the state of the art.
301
+ * [WhisperLive](https://github.com/collabora/WhisperLive) is a nearly-live implementation of OpenAI's Whisper which uses faster-whisper as the backend to transcribe audio in real-time.
302
+ * [Faster-Whisper-Transcriber](https://github.com/BBC-Esq/ctranslate2-faster-whisper-transcriber) is a simple but reliable voice transcriber that provides a user-friendly interface.
303
+
304
+ ## Model conversion
305
+
306
+ When loading a model from its size such as `WhisperModel("large-v3")`, the corresponding CTranslate2 model is automatically downloaded from the [Hugging Face Hub](https://huggingface.co/Systran).
307
+
308
+ We also provide a script to convert any Whisper models compatible with the Transformers library. They could be the original OpenAI models or user fine-tuned models.
309
+
310
+ For example the command below converts the [original "large-v3" Whisper model](https://huggingface.co/openai/whisper-large-v3) and saves the weights in FP16:
311
+
312
+ ```bash
313
+ pip install "transformers[torch]>=4.23"
314
+
315
+ ct2-transformers-converter --model openai/whisper-large-v3 --output_dir whisper-large-v3-ct2 \
316
+ --copy_files tokenizer.json preprocessor_config.json --quantization float16
317
+ ```
318
+
319
+ * The option `--model` accepts a model name on the Hub or a path to a model directory.
320
+ * If the option `--copy_files tokenizer.json` is not used, the tokenizer configuration is automatically downloaded when the model is loaded later.
321
+
322
+ Models can also be converted from the code. See the [conversion API](https://opennmt.net/CTranslate2/python/ctranslate2.converters.TransformersConverter.html).
323
+
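As a rough sketch of converting from code (argument names follow the CTranslate2 `TransformersConverter` documentation linked above and should be double-checked there), the CLI command shown earlier could be mirrored roughly like this:

```python
import ctranslate2

# Mirrors the ct2-transformers-converter call above: convert the Transformers
# checkpoint and save the CTranslate2 weights in FP16.
converter = ctranslate2.converters.TransformersConverter(
    "openai/whisper-large-v3",
    copy_files=["tokenizer.json", "preprocessor_config.json"],
)
converter.convert("whisper-large-v3-ct2", quantization="float16")
```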
324
+ ### Load a converted model
325
+
326
+ 1. Directly load the model from a local directory:
327
+ ```python
328
+ model = faster_whisper.WhisperModel("whisper-large-v3-ct2")
329
+ ```
330
+
331
+ 2. [Upload your model to the Hugging Face Hub](https://huggingface.co/docs/transformers/model_sharing#upload-with-the-web-interface) and load it from its name:
332
+ ```python
333
+ model = faster_whisper.WhisperModel("username/whisper-large-v3-ct2")
334
+ ```
335
+
336
+ ## Comparing performance against other implementations
337
+
338
+ If you are comparing the performance against other Whisper implementations, you should make sure to run the comparison with similar settings. In particular:
339
+
340
+ * Verify that the same transcription options are used, especially the same beam size. For example in openai/whisper, `model.transcribe` uses a default beam size of 1 but here we use a default beam size of 5.
341
+ * When running on CPU, make sure to set the same number of threads. Many frameworks will read the environment variable `OMP_NUM_THREADS`, which can be set when running your script:
342
+
343
+ ```bash
344
+ OMP_NUM_THREADS=4 python3 my_script.py
345
+ ```
346
+
347
+
whisper_pipeline/faster-whisper-main/faster_whisper.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,25 @@
1
+ LICENSE
2
+ MANIFEST.in
3
+ README.md
4
+ requirements.conversion.txt
5
+ requirements.txt
6
+ setup.cfg
7
+ setup.py
8
+ faster_whisper/__init__.py
9
+ faster_whisper/audio.py
10
+ faster_whisper/feature_extractor.py
11
+ faster_whisper/tokenizer.py
12
+ faster_whisper/transcribe.py
13
+ faster_whisper/utils.py
14
+ faster_whisper/vad.py
15
+ faster_whisper/version.py
16
+ faster_whisper.egg-info/PKG-INFO
17
+ faster_whisper.egg-info/SOURCES.txt
18
+ faster_whisper.egg-info/dependency_links.txt
19
+ faster_whisper.egg-info/requires.txt
20
+ faster_whisper.egg-info/top_level.txt
21
+ faster_whisper/assets/__init__.py
22
+ faster_whisper/assets/pyannote_vad_model.bin
23
+ faster_whisper/assets/silero_vad.onnx
24
+ tests/test_transcribe.py
25
+ tests/test_utils.py
whisper_pipeline/faster-whisper-main/faster_whisper.egg-info/dependency_links.txt ADDED
@@ -0,0 +1 @@
1
+
whisper_pipeline/faster-whisper-main/faster_whisper.egg-info/requires.txt ADDED
@@ -0,0 +1,17 @@
1
+ ctranslate2<5,>=4.0
2
+ huggingface_hub>=0.13
3
+ onnxruntime<2,>=1.14
4
+ pyannote-audio
5
+ tokenizers<1,>=0.13
6
+ torch
7
+ torchaudio
8
+ tqdm
9
+
10
+ [conversion]
11
+ transformers[torch]>=4.23
12
+
13
+ [dev]
14
+ black==23.*
15
+ flake8==6.*
16
+ isort==5.*
17
+ pytest==7.*
whisper_pipeline/faster-whisper-main/faster_whisper.egg-info/top_level.txt ADDED
@@ -0,0 +1 @@
1
+ faster_whisper
whisper_pipeline/faster-whisper-main/faster_whisper/__init__.py ADDED
@@ -0,0 +1,14 @@
1
+ from faster_whisper.audio import decode_audio
2
+ from faster_whisper.transcribe import BatchedInferencePipeline, WhisperModel
3
+ from faster_whisper.utils import available_models, download_model, format_timestamp
4
+ from faster_whisper.version import __version__
5
+
6
+ __all__ = [
7
+ "available_models",
8
+ "decode_audio",
9
+ "WhisperModel",
10
+ "BatchedInferencePipeline",
11
+ "download_model",
12
+ "format_timestamp",
13
+ "__version__",
14
+ ]
whisper_pipeline/faster-whisper-main/faster_whisper/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (572 Bytes). View file
 
whisper_pipeline/faster-whisper-main/faster_whisper/__pycache__/audio.cpython-310.pyc ADDED
Binary file (1.59 kB). View file
 
whisper_pipeline/faster-whisper-main/faster_whisper/__pycache__/feature_extractor.cpython-310.pyc ADDED
Binary file (2.73 kB). View file
 
whisper_pipeline/faster-whisper-main/faster_whisper/__pycache__/tokenizer.cpython-310.pyc ADDED
Binary file (6.78 kB). View file
 
whisper_pipeline/faster-whisper-main/faster_whisper/__pycache__/transcribe.cpython-310.pyc ADDED
Binary file (53.3 kB). View file
 
whisper_pipeline/faster-whisper-main/faster_whisper/__pycache__/utils.cpython-310.pyc ADDED
Binary file (5.13 kB). View file
 
whisper_pipeline/faster-whisper-main/faster_whisper/__pycache__/vad.cpython-310.pyc ADDED
Binary file (15.2 kB). View file
 
whisper_pipeline/faster-whisper-main/faster_whisper/__pycache__/version.cpython-310.pyc ADDED
Binary file (248 Bytes). View file
 
whisper_pipeline/faster-whisper-main/faster_whisper/assets/__init__.py ADDED
File without changes
whisper_pipeline/faster-whisper-main/faster_whisper/assets/pyannote_vad_model.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0b5b3216d60a2d32fc086b47ea8c67589aaeb26b7e07fcbe620d6d0b83e209ea
3
+ size 17719103
whisper_pipeline/faster-whisper-main/faster_whisper/assets/silero_vad.onnx ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6b99cbfd39246b6706f98ec13c7c50c6b299181f2474fa05cbc8046acc274396
3
+ size 2313101
whisper_pipeline/faster-whisper-main/faster_whisper/audio.py ADDED
@@ -0,0 +1,58 @@
1
+ from typing import BinaryIO, Union
2
+
3
+ import torch
4
+ import torchaudio
5
+
6
+
7
+ def decode_audio(
8
+ input_file: Union[str, BinaryIO],
9
+ sampling_rate: int = 16000,
10
+ split_stereo: bool = False,
11
+ ):
12
+ """Decodes the audio.
13
+
14
+ Args:
15
+ input_file: Path to the input file or a file-like object.
16
+ sampling_rate: Resample the audio to this sample rate.
17
+ split_stereo: Return separate left and right channels.
18
+
19
+ Returns:
20
+ A float32 Torch Tensor.
21
+
22
+ If `split_stereo` is enabled, the function returns a 2-tuple with the
23
+ separated left and right channels.
24
+ """
25
+
26
+ waveform, audio_sf = torchaudio.load(input_file) # waveform: channels X T
27
+
28
+ if audio_sf != sampling_rate:
29
+ waveform = torchaudio.functional.resample(
30
+ waveform, orig_freq=audio_sf, new_freq=sampling_rate
31
+ )
32
+ if split_stereo:
33
+ return waveform[0], waveform[1]
34
+
35
+ return waveform.mean(0)
36
+
37
+
38
+ def pad_or_trim(array, length: int, *, axis: int = -1):
39
+ """
40
+ Pad or trim the audio array to N_SAMPLES, as expected by the encoder.
41
+ """
42
+ axis = axis % array.ndim
43
+ if array.shape[axis] > length:
44
+ idx = [Ellipsis] * axis + [slice(length)] + [Ellipsis] * (array.ndim - axis - 1)
45
+ return array[idx]
46
+
47
+ if array.shape[axis] < length:
48
+ pad_widths = (
49
+ [
50
+ 0,
51
+ ]
52
+ * array.ndim
53
+ * 2
54
+ )
55
+ pad_widths[2 * axis] = length - array.shape[axis]
56
+ array = torch.nn.functional.pad(array, tuple(pad_widths[::-1]))
57
+
58
+ return array
whisper_pipeline/faster-whisper-main/faster_whisper/feature_extractor.py ADDED
@@ -0,0 +1,114 @@
1
+ import torch
2
+
3
+
4
+ # Adapted from https://github.com/huggingface/transformers/blob/main/src/transformers/models/whisper/feature_extraction_whisper.py # noqa: E501
5
+ class FeatureExtractor:
6
+ def __init__(
7
+ self,
8
+ device: str = "auto",
9
+ feature_size=80,
10
+ sampling_rate=16000,
11
+ hop_length=160,
12
+ chunk_length=30,
13
+ n_fft=400,
14
+ ):
15
+ if device == "auto":
16
+ self.device = "cuda" if torch.cuda.is_available() else "cpu"
17
+ else:
18
+ self.device = device
19
+ self.n_fft = n_fft
20
+ self.hop_length = hop_length
21
+ self.chunk_length = chunk_length
22
+ self.n_samples = chunk_length * sampling_rate
23
+ self.nb_max_frames = self.n_samples // hop_length
24
+ self.time_per_frame = hop_length / sampling_rate
25
+ self.sampling_rate = sampling_rate
26
+ self.mel_filters = self.get_mel_filters(
27
+ sampling_rate, n_fft, n_mels=feature_size
28
+ )
29
+
30
+ @staticmethod
31
+ def get_mel_filters(sr, n_fft, n_mels=128):
32
+ """
33
+ Implementation of librosa.filters.mel in Pytorch
34
+ """
35
+ # Initialize the weights
36
+ n_mels = int(n_mels)
37
+
38
+ # Center freqs of each FFT bin
39
+ fftfreqs = torch.fft.rfftfreq(n=n_fft, d=1.0 / sr)
40
+
41
+ # 'Center freqs' of mel bands - uniformly spaced between limits
42
+ min_mel = 0.0
43
+ max_mel = 45.245640471924965
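+ # note: 45.2456 mel on the Slaney scale corresponds to 8000 Hz, the Nyquist frequency for 16 kHz audio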
44
+
45
+ mels = torch.linspace(min_mel, max_mel, n_mels + 2)
46
+
47
+ # Fill in the linear scale
48
+ f_min = 0.0
49
+ f_sp = 200.0 / 3
50
+ freqs = f_min + f_sp * mels
51
+
52
+ # And now the nonlinear scale
53
+ min_log_hz = 1000.0 # beginning of log region (Hz)
54
+ min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels)
55
+ logstep = torch.log(torch.tensor(6.4)) / 27.0 # step size for log region
56
+
57
+ # If we have vector data, vectorize
58
+ log_t = mels >= min_log_mel
59
+ freqs[log_t] = min_log_hz * torch.exp(logstep * (mels[log_t] - min_log_mel))
60
+
61
+ mel_f = freqs
62
+
63
+ fdiff = torch.diff(mel_f)
64
+ ramps = mel_f.view(-1, 1) - fftfreqs.view(1, -1)
65
+
66
+ lower = -ramps[:-2] / fdiff[:-1].unsqueeze(1)
67
+ upper = ramps[2:] / fdiff[1:].unsqueeze(1)
68
+
69
+ # Intersect them with each other and zero, vectorized across all i
70
+ weights = torch.maximum(torch.zeros_like(lower), torch.minimum(lower, upper))
71
+
72
+ # Slaney-style mel is scaled to be approx constant energy per channel
73
+ enorm = 2.0 / (mel_f[2 : n_mels + 2] - mel_f[:n_mels])
74
+ weights *= enorm.unsqueeze(1)
75
+
76
+ return weights
77
+
78
+ def __call__(self, waveform, padding=True, chunk_length=None, to_cpu=False):
79
+ """
80
+ Compute the log-Mel spectrogram of the provided audio.
81
+ """
82
+
83
+ if chunk_length is not None:
84
+ self.n_samples = chunk_length * self.sampling_rate
85
+ self.nb_max_frames = self.n_samples // self.hop_length
86
+
87
+ if waveform.dtype is not torch.float32:
88
+ waveform = waveform.to(torch.float32)
89
+
90
+ waveform = (
91
+ waveform.to(self.device)
92
+ if self.device == "cuda" and not waveform.is_cuda
93
+ else waveform
94
+ )
95
+
96
+ if padding:
97
+ waveform = torch.nn.functional.pad(waveform, (0, self.n_samples))
98
+
99
+ window = torch.hann_window(self.n_fft).to(waveform.device)
100
+
101
+ stft = torch.stft(
102
+ waveform, self.n_fft, self.hop_length, window=window, return_complex=True
103
+ )
104
+ magnitudes = stft[..., :-1].abs() ** 2
105
+
106
+ mel_spec = self.mel_filters.to(waveform.device) @ magnitudes
107
+
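+ # Whisper-style log-mel normalization: floor at 1e-10, clamp to within 8 (log10 units, ~80 dB) of the peak, then shift and rescale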
108
+ log_spec = torch.clamp(mel_spec, min=1e-10).log10()
109
+ log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
110
+ log_spec = (log_spec + 4.0) / 4.0
111
+
112
+ # When the model is running on multiple GPUs, the output should be moved
113
+ # to the CPU since we don't know which GPU will handle the next job.
114
+ return log_spec.cpu() if to_cpu else log_spec