|
import base64 |
|
import io |
|
import traceback |
|
|
|
import numpy as np |
|
from pydub import AudioSegment |
|
|
|
from utils import get_device |
|
|
|
|
|
def get_transcriber(model="openai/whisper-base.en", use_gpu=True, gpu_id='auto'):
    """Create a transformers automatic-speech-recognition pipeline.

    Args:
        model: Model identifier handed to ``transformers.pipeline``.
        use_gpu: When false, no explicit GPU placement is requested.
        gpu_id: GPU index to place the model on; ``'auto'`` is treated as 0
            and a negative value selects the generic ``'cuda'`` device.

    Returns:
        The pipeline object produced by ``transformers.pipeline``.
    """
    gpu_index = 0 if gpu_id == 'auto' else gpu_id

    device = get_device()
    if device != 'cpu' and use_gpu:
        # Pin the whole model (root module key "") onto a single device.
        placement = {"": gpu_index} if gpu_index >= 0 else {'': 'cuda'}
    else:
        # NOTE(review): 'auto' is also used when use_gpu is False; confirm this
        # is the intended placement on hosts that do have a GPU.
        placement = 'auto'

    # Imported lazily so that transformers is only loaded when a transcriber
    # is actually requested.
    from transformers import pipeline

    return pipeline("automatic-speech-recognition", model=model, device_map=placement)
|
|
|
|
|
def audio_bytes_to_numpy(audio_bytes):
    """Decode an in-memory audio file into a sample rate and a sample array.

    Args:
        audio_bytes: Raw bytes of an audio file in a format that
            ``AudioSegment.from_file`` can read.

    Returns:
        A ``(sample_rate, samples)`` tuple. ``samples`` is a 1-D NumPy array
        for single-channel audio; for multi-channel audio it is reshaped to
        ``(-1, channels)``.
    """
    # Wrap the bytes in a file-like object so pydub can parse them.
    segment = AudioSegment.from_file(io.BytesIO(audio_bytes))

    pcm = np.array(segment.get_array_of_samples())
    sample_rate = segment.frame_rate

    channel_count = segment.channels
    if channel_count > 1:
        # One row per frame, one column per channel
        # (assumes the flat sample array is channel-interleaved - TODO confirm).
        pcm = pcm.reshape((-1, channel_count))

    return sample_rate, pcm
|
|
|
|
|
def _normalize_chunks(chunks):
    """Concatenate audio chunks into one float32 array scaled to a peak of ~1."""
    stream = np.concatenate(chunks).astype(np.float32)
    # The small epsilon keeps the divisor non-zero for an all-zero stream.
    stream /= np.max(np.abs(stream) + 1E-7)
    return stream


def transcribe(audio_state1, new_chunk, transcriber=None, max_chunks=None, sst_floor=100.0, reject_no_new_text=True,
               debug=False):
    """Fold one streamed audio chunk into the running transcription state.

    ``audio_state1`` is a mutable, index-addressed sequence that is updated in
    place:

    * ``[0]`` - base text; ``None`` is replaced by ``''``.
    * ``[1]`` - output text, set to ``[0]`` plus the newly transcribed text.
    * ``[2]`` - list of audio chunks accepted so far; ``None`` is replaced
      by ``[]``.
    * ``[3]`` - status flag; the value ``'off'`` stops further processing.

    Args:
        audio_state1: The state sequence described above.
        new_chunk: Either a ``(sampling_rate, samples)`` pair, or a base64
            string of an encoded audio file, which is decoded with
            ``audio_bytes_to_numpy``.
        transcriber: Callable taking ``{"sampling_rate": ..., "raw": ...}`` and
            returning a mapping with a ``"text"`` entry. It is only invoked
            when there is audio to transcribe.
        max_chunks: If given, the state is returned untouched once more than
            this many chunks are stored.
        sst_floor: A chunk is kept only if its mean absolute amplitude is
            strictly greater than this value.
        reject_no_new_text: If true, a chunk that does not change the
            transcription of the accumulated audio is not stored.
        debug: Print progress information to stdout.

    Returns:
        ``(audio_state1, audio_state1[1])``.
    """
    if debug:
        print("start transcribe", flush=True)

    # Normalise unset state slots.
    if audio_state1[0] is None:
        audio_state1[0] = ''
    if audio_state1[2] is None:
        audio_state1[2] = []

    if max_chunks is not None and audio_state1[2] is not None and len(audio_state1[2]) > max_chunks:
        # Chunk budget exhausted: leave the state as it is.
        return audio_state1, audio_state1[1]
    if audio_state1[3] == 'off':
        if debug:
            print("Already ended", flush=True)
        return audio_state1, audio_state1[1]

    # The sampling rate of the current chunk is used for the whole stream.
    if isinstance(new_chunk, str):
        audio_bytes = base64.b64decode(new_chunk.encode('utf-8'))
        sr, y = audio_bytes_to_numpy(audio_bytes)
    else:
        sr, y = new_chunk

    if debug:
        print("post encode", flush=True)

    if y.shape[0] == 0:
        avg = 0.0
    else:
        if len(y.shape) > 1:
            # Collapse multi-channel audio to mono. A leading dimension of 2 is
            # taken to mean channel-major layout; anything else sample-major.
            if y.shape[0] == 2:
                y = np.mean(y, axis=0)
            else:
                y = np.mean(y, axis=1)
        avg = np.average(np.abs(y))
    if not np.isfinite(avg):
        avg = 0.0

    prior_chunks = audio_state1[2]
    chunk_added = avg > sst_floor
    if chunk_added:
        if debug:
            print("Got possible chunk: %s" % avg, flush=True)
        chunks_new = prior_chunks + [y]
    else:
        chunks_new = prior_chunks
        if debug:
            print("Rejected quiet chunk: %s" % avg, flush=True)

    if chunks_new:
        stream = _normalize_chunks(chunks_new)
        if debug:
            print("pre transcriber", flush=True)
        text = transcriber({"sampling_rate": sr, "raw": stream})["text"]
        if debug:
            print("post transcriber", flush=True)

        if not chunk_added:
            # No chunk was appended, so the "previous" audio is exactly the
            # audio just transcribed; reuse the result instead of running the
            # model a second time on identical input.
            text_y = text
        elif prior_chunks:
            try:
                stream0 = _normalize_chunks(prior_chunks)
            except Exception as e:
                print("Exception: %s %s" % (str(e), traceback.format_exc()), flush=True)
                raise
            if debug:
                print("pre stranscriber", flush=True)
            text_y = transcriber({"sampling_rate": sr, "raw": stream0})["text"]
            if debug:
                print("post stranscriber", flush=True)
        else:
            text_y = None

        if debug:
            print("y.shape: %s stream.shape: %s text0=%s text=%s text_y=%s" % (
                str(y.shape), str(stream.shape), audio_state1[0], text, text_y))
        if reject_no_new_text and (text == text_y):
            # The chunk did not change the transcription, so it is not stored.
            if debug:
                print("Rejected non-textual chunk: %s" % avg, flush=True)
        else:
            audio_state1[2] = chunks_new
    else:
        text = ''

    # Avoid emitting the same text twice when the base text already equals
    # the fresh transcription.
    if audio_state1[0] == text:
        text = ''

    if audio_state1[0] is not None:
        audio_state1[1] = audio_state1[0] + text
    return audio_state1, audio_state1[1]
|
|