"""Streaming speech recognition on top of whisper_online's VAC ASR pipeline."""

import re

import numpy as np
import torch

from modules.models.whisper_online import *


class MosesTokeniserWrapper:
    """Minimal tokenizer stand-in with the `split` interface the
    whisper_online processor expects.

    Splits on whitespace and additionally emits '(' and ')' as
    stand-alone tokens.
    """

    def split(self, text):
        """Return the list of tokens in *text*.

        Each token is either a maximal run of non-space, non-parenthesis
        characters, or a single '(' / ')' character.
        """
        return re.findall(r"[^()\s]+|[()]", text)


class SpeechRecognizer():
    """Incremental speech recognizer wrapping a VAD-gated FasterWhisper ASR."""

    # Minimum accepted audio chunk, in bytes (80 int16 PCM samples).
    MIN_CHUNK_BYTES = 160

    def __init__(self, language: str, model: str) -> None:
        """Build the ASR backend and the online (streaming) processor.

        Args:
            language: source-language code passed to FasterWhisperASR.
            model: model name/path passed to FasterWhisperASR.
        """
        # NOTE: attribute name keeps the historical misspelling
        # ("transcribtion") for backward compatibility with external callers.
        self.transcribtion_buffer = []
        self.src_lan = language
        asr = FasterWhisperASR(self.src_lan, model)
        asr.use_vad()
        tokenizer = MosesTokeniserWrapper()
        self.processor = VACOnlineASRProcessor(
            online_chunk_size=1,
            asr=asr,
            tokenizer=tokenizer,
            buffer_trimming=("segment", 15),
        )

    def append_audio(self, audio: bytes) -> None:
        """Feed a chunk of raw int16 PCM bytes into the streaming processor.

        Raises:
            ValueError: if the chunk is shorter than MIN_CHUNK_BYTES.
                (ValueError subclasses Exception, so existing
                `except Exception` handlers still catch it.)
        """
        if len(audio) < self.MIN_CHUNK_BYTES:
            raise ValueError("Chunk is too small (need to be at least 160 bytes)")
        self.processor.insert_audio_chunk(self.audio_to_tensor(audio))

    def buffer_size(self):
        """Return the processor's configured online chunk size."""
        return self.processor.online_chunk_size

    def process_buffer(self):
        """Run one incremental decoding step and return its result."""
        return self.processor.process_iter()

    def flush(self):
        """Finalize decoding and return whatever transcription remains."""
        return self.processor.finish()

    def clear_buffer(self) -> None:
        """Empty the local transcription buffer."""
        self.transcribtion_buffer.clear()

    def get_status(self):
        """Return the underlying processor's status."""
        return self.processor.status

    def audio_to_tensor(self, audio_frame: bytes):
        """Convert raw int16 PCM bytes to a float32 torch tensor.

        Samples are scaled by 1/32767 into (approximately) [-1.0, 1.0].
        """
        samples = np.frombuffer(audio_frame, dtype=np.int16).astype(np.float32) / 32767.0
        return torch.from_numpy(samples)