temperature removed, added multilingual

- .gitignore +2 -1
- static/client.html +16 -4
- unified_socket_server.py +10 -4
.gitignore
CHANGED
@@ -1,4 +1,5 @@
 *.ipynb
 old_demo_code/
 htokenf.txt
-__pycache__/
+__pycache__/
+.vscode/
static/client.html
CHANGED
@@ -238,7 +238,13 @@

             <div class="config-group">
                 <label for="chunkSize">Chunk Size (ms):</label>
-                <
+                <select id="chunkSize">
+                    <option value="40">40</option>
+                    <option value="100">100</option>
+                    <option value="200">200</option>
+                    <option value="300" selected>300</option>
+                    <option value="1000">1000</option>
+                </select>
             </div>

             <div class="config-group">
@@ -247,8 +253,14 @@
             </div>

             <div class="config-group">
-                <label for="
-                <
+                <label for="language">Language:</label>
+                <select id="language">
+                    <option value="en" selected>English</option>
+                    <option value="fr">French</option>
+                    <option value="es">Spanish</option>
+                    <option value="de">German</option>
+                    <option value="pt">Portuguese</option>
+                </select>
             </div>
         </div>

@@ -357,7 +369,7 @@
                 model_size: document.getElementById('modelSize').value,
                 chunk_size: parseInt(document.getElementById('chunkSize').value),
                 beam_size: parseInt(document.getElementById('beamSize').value),
-
+                language: document.getElementById('language').value
             };

             log('Starting transcription session...');
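Taken together, the client now sends a `language` field alongside the existing config keys. For reference, a config payload matching the new fields might look like the following minimal sketch; the field names and the large-v2/300ms multilingual constraint come from the diff, while the concrete values and the surrounding message framing are illustrative assumptions:

import json

# Hypothetical example of the session config the client-side JS now builds.
config = {
    "model_size": "large-v2",   # multilingual requires large-v2
    "chunk_size": 300,          # in ms; multilingual requires 300
    "beam_size": 5,
    "language": "fr",           # "en", "fr", "es", "de", or "pt" per the new <select>
}
message = json.dumps(config)   # sent to the server over the WebSocket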
unified_socket_server.py
CHANGED
@@ -126,7 +126,7 @@ class UnifiedTranscriptionServer:
         logger.info(f"Received config from {client_id}: {config}")

         # Validate config
-        required_fields = ['model_size', 'chunk_size', 'beam_size', 'temperature']
+        required_fields = ['model_size', 'chunk_size', 'beam_size', 'language']
         for field in required_fields:
             if field not in config:
                 await ws.send_str(json.dumps({"error": f"Missing required field: {field}"}))
@@ -138,9 +138,16 @@ class UnifiedTranscriptionServer:

         logger.info(f"Loading model {model_size} for client {client_id}")

+        # If the requested language is not English, reject the config unless
+        # the model is large-v2 with a 300ms chunk size (the only supported combo).
+        if multilingual := config['language'] != "en":
+            if model_size != "large-v2" or chunk_size != 300:
+                await ws.send_str(json.dumps({"error": "Multilingual transcription is currently available only on the large-v2 model with a chunk size of 300ms."}))
+                return
+
         # Try to use whisper_stream, fallback to regular whisper
         try:
-            model = load_streaming_model_correct(model_size, chunk_size)
+            model = load_streaming_model_correct(model_size, chunk_size, multilingual)
             client['first_chunk'] = True
             if torch.cuda.is_available():
                 model = model.to("cuda")
@@ -236,12 +243,11 @@ class UnifiedTranscriptionServer:
         if hasattr(model, 'decode') and 'use_streaming' not in client:
             # Using whisper_stream
             decoding_options = DecodingOptions(
-                language=
+                language=config['language'],
                 gran=(config['chunk_size'] // 20),
                 single_frame_mel=True,
                 without_timestamps=True,
                 beam_size=config['beam_size'],
-                temperature=config['temperature'],
                 stream_decode=True,
                 use_ca_kv_cache=True,
                 look_ahead_blocks=model.extra_gran_blocks
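The new server-side guard is easy to check in isolation. Below is a minimal sketch of the validation logic factored into a pure function; the function name and this refactoring are hypothetical and not part of the commit, while the required-field list and the large-v2/300ms restriction are taken directly from the diff:

REQUIRED_FIELDS = ('model_size', 'chunk_size', 'beam_size', 'language')

def validate_config(config):
    """Return an error string, or None if the config is acceptable.

    Hypothetical helper mirroring the checks added in unified_socket_server.py.
    """
    for field in REQUIRED_FIELDS:
        if field not in config:
            return f"Missing required field: {field}"
    if config['language'] != "en":
        # Multilingual streaming is currently limited to large-v2 at 300ms chunks.
        if config['model_size'] != "large-v2" or config['chunk_size'] != 300:
            return ("Multilingual transcription is currently available only on "
                    "the large-v2 model with a chunk size of 300ms.")
    return None

# English configs pass regardless of model; non-English needs large-v2 @ 300ms.
assert validate_config({"model_size": "large-v2", "chunk_size": 300,
                        "beam_size": 5, "language": "de"}) is None
assert validate_config({"model_size": "base", "chunk_size": 100,
                        "beam_size": 5, "language": "de"}) is not None
assert validate_config({"model_size": "base", "chunk_size": 100,
                        "beam_size": 5}) == "Missing required field: language"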