# STT/modules/models/speech_recognizer.py
# Uploaded to a Hugging Face Space by goldpulpy (commit 13f6d73).
import numpy as np
import torch

from modules.models.whisper_online import *
class SpeechRecognizer:
    """Streaming speech-to-text wrapper around a VAC-gated online Whisper pipeline.

    Raw 16-bit little-endian PCM bytes are appended with :meth:`append_audio`,
    partial results are pulled with :meth:`process_buffer`, and the remaining
    tail is drained with :meth:`flush`.
    """

    def __init__(self, language: str, model: str) -> None:
        """Build the ASR pipeline.

        :param language: source-language code passed to ``FasterWhisperASR``.
        :param model: Whisper model name/size passed to ``FasterWhisperASR``.
        """
        # NOTE(review): attribute keeps the historical misspelling
        # ("transcribtion") for backward compatibility with external readers.
        self.transcribtion_buffer: list = []
        self.src_lan = language
        asr = FasterWhisperASR(self.src_lan, model)
        asr.use_vad()
        tokenizer = MosesTokeniserWrapper()
        self.processor = VACOnlineASRProcessor(
            online_chunk_size=1,
            asr=asr,
            tokenizer=tokenizer,
            buffer_trimming=("segment", 15),
        )

    def append_audio(self, audio: bytes) -> None:
        """Queue a chunk of raw int16 PCM audio for recognition.

        :param audio: little-endian 16-bit PCM bytes, at least 160 bytes long.
        :raises ValueError: if the chunk is shorter than 160 bytes.
        """
        if len(audio) < 160:
            # ValueError subclasses Exception, so callers catching the old
            # bare Exception still work.
            raise ValueError("Chunk is too small (need to be at least 160 bytes)")
        self.processor.insert_audio_chunk(self.audio_to_tensor(audio))

    def buffer_size(self):
        """Return the processor's online chunk size."""
        return self.processor.online_chunk_size

    def process_buffer(self):
        """Run one incremental recognition step and return its partial result."""
        return self.processor.process_iter()

    def flush(self):
        """Finalize recognition and return whatever is still buffered."""
        return self.processor.finish()

    def clear_buffer(self) -> None:
        """Empty the local transcription buffer."""
        self.transcribtion_buffer.clear()

    def get_status(self):
        """Expose the underlying processor's status."""
        return self.processor.status

    def audio_to_tensor(self, audio_frame: bytes) -> "torch.Tensor":
        """Convert raw int16 PCM bytes to a float32 torch tensor in ~[-1, 1].

        NOTE(review): divides by 32767.0 (int16 max); the common Whisper
        convention is 32768.0 — kept as-is to preserve behavior, confirm
        against the downstream model's expectation.
        """
        buffer = np.frombuffer(audio_frame, dtype=np.int16).astype(np.float32) / 32767.0
        return torch.from_numpy(buffer)
import re
class MosesTokeniserWrapper:
    """Minimal tokenizer stand-in: whitespace-split tokens, parentheses separate.

    The no-op ``__init__`` of the original was dropped — the implicit default
    constructor is behaviorally identical.
    """

    # Compiled once at class level so repeated split() calls skip the
    # re-module cache lookup.
    _TOKEN_RE = re.compile(r"[^()\s]+|[()]")

    def split(self, text: str) -> list:
        """Return the tokens of *text*; each parenthesis is its own token."""
        return self._TOKEN_RE.findall(text)