|
import re

import numpy as np
import torch

from modules.models.whisper_online import FasterWhisperASR, VACOnlineASRProcessor
|
class SpeechRecognizer:
    """Streaming speech recognition wrapper around a VAD-gated online Whisper processor."""

    def __init__(self, language: str, model: str) -> None:
        self.transcription_buffer = []
        self.src_lan = language

        # Faster-Whisper backend with voice-activity detection enabled.
        asr = FasterWhisperASR(self.src_lan, model)
        asr.use_vad()

        tokenizer = MosesTokeniserWrapper()
        self.processor = VACOnlineASRProcessor(
            online_chunk_size=1,
            asr=asr,
            tokenizer=tokenizer,
            buffer_trimming=("segment", 15),
        )
|
    def append_audio(self, audio: bytes):
        # Guard against chunks too small to process: 160 bytes = 80 int16 samples.
        if len(audio) < 160:
            raise ValueError("Chunk is too small (must be at least 160 bytes)")

        self.processor.insert_audio_chunk(self.audio_to_tensor(audio))
|
    def buffer_size(self):
        # The configured online chunk size, not the amount of audio currently buffered.
        return self.processor.online_chunk_size
|
    def process_buffer(self):
        # Run one incremental recognition step over the buffered audio.
        return self.processor.process_iter()
|
    def flush(self):
        # Finalize processing and return any remaining, uncommitted transcript.
        return self.processor.finish()
|
    def clear_buffer(self):
        self.transcription_buffer.clear()
|
    def get_status(self):
        # Voice-activity status as reported by the VAC processor.
        return self.processor.status
|
    def audio_to_tensor(self, audio_frame: bytes):
        # Interpret the bytes as 16-bit PCM and normalize to float32 in [-1.0, 1.0].
        buffer = np.frombuffer(audio_frame, dtype=np.int16).astype(np.float32) / 32767.0
        return torch.from_numpy(buffer)
|
class MosesTokeniserWrapper:
    """Minimal stand-in for the Moses tokenizer interface expected by the
    processor: split() returns whitespace-separated tokens, with parentheses
    split off as separate tokens."""

    def split(self, text):
        # Match runs of non-whitespace, non-parenthesis characters, or a single parenthesis.
        rx = r"[^()\s]+|[()]"
        return re.findall(rx, text)
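
# Illustrative behaviour of the wrapper (a sketch, not executed at import time):
#   MosesTokeniserWrapper().split("hello (world) again")
#   -> ['hello', '(', 'world', ')', 'again']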
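
# Minimal usage sketch (illustration only, not part of the module's API): stream
# raw 16-bit PCM into the recognizer and print incremental results. The file
# path, chunk size, language, and model name below are hypothetical
# placeholders; the input is assumed to be 16 kHz mono 16-bit PCM, matching
# the format that audio_to_tensor() expects.
if __name__ == "__main__":
    recognizer = SpeechRecognizer(language="en", model="large-v2")

    with open("example.pcm", "rb") as f:  # hypothetical raw PCM capture
        while chunk := f.read(3200):  # 3200 bytes = 100 ms of 16 kHz int16 audio
            if len(chunk) < 160:
                break  # drop a trailing chunk too small for append_audio()
            recognizer.append_audio(chunk)
            print(recognizer.process_buffer())

    # Flush whatever hypothesis remains at end of stream.
    print(recognizer.flush())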