File size: 1,233 Bytes
13f6d73
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import re

from modules.models.whisper_online import *

class SpeechRecognizer():
	"""Streaming speech-to-text front end around a VAC online Whisper processor.

	Raw 16-bit PCM chunks go in via ``append_audio``; partial and final
	transcription results come out of ``process_buffer`` / ``flush``.
	"""

	def __init__(self, language: str, model: str) -> None:
		# Holds finalized transcript fragments; only mutated via clear_buffer here.
		self.transcribtion_buffer = []
		self.src_lan = language
		recognizer = FasterWhisperASR(self.src_lan, model)
		recognizer.use_vad()
		self.processor = VACOnlineASRProcessor(
			online_chunk_size=1,
			asr=recognizer,
			tokenizer=MosesTokeniserWrapper(),
			buffer_trimming=("segment", 15),
		)

	def append_audio(self, audio: bytes):
		"""Feed one chunk of raw int16 PCM audio into the online processor.

		Raises a generic Exception for chunks shorter than 160 bytes.
		"""
		if len(audio) < 160:
			raise Exception("Chunk is too small (need to be at least 160 bytes)")
		self.processor.insert_audio_chunk(self.audio_to_tensor(audio))

	def buffer_size(self):
		"""Return the processor's configured online chunk size."""
		return self.processor.online_chunk_size

	def process_buffer(self):
		"""Run one incremental recognition step on the buffered audio."""
		return self.processor.process_iter()

	def flush(self):
		"""Finalize recognition and return whatever transcript remains."""
		return self.processor.finish()

	def clear_buffer(self):
		"""Drop all accumulated transcript fragments."""
		del self.transcribtion_buffer[:]

	def get_status(self):
		"""Expose the underlying processor's status field."""
		return self.processor.status

	def audio_to_tensor(self, audio_frame: bytes):
		"""Convert little-endian int16 PCM bytes to a float32 torch tensor.

		Samples are scaled by 1/32767, so values land in roughly [-1, 1].
		"""
		samples = np.frombuffer(audio_frame, dtype=np.int16)
		scaled = samples.astype(np.float32) / 32767.0
		return torch.from_numpy(scaled)

class MosesTokeniserWrapper:
	"""Minimal tokenizer shim with a Moses-like ``split`` interface.

	Splits text on whitespace while emitting each parenthesis as its own
	token, e.g. ``"a (b)"`` -> ``["a", "(", "b", ")"]``.
	"""

	# Compiled once at class-creation time: a run of characters that are
	# neither whitespace nor parentheses, OR a single '(' / ')'.
	_TOKEN_RX = re.compile(r"[^()\s]+|[()]")

	def split(self, text):
		"""Return the list of tokens found in *text* (empty list for "")."""
		return self._TOKEN_RX.findall(text)