Spaces:

sinabis
/

transcription_service

Sleeping

App Files Files Community

transcription_service / src /runners /audio_transcriber.py

aleger

add bentoml files

d8d26b1 almost 2 years ago

raw

history blame contribute delete

2.81 kB

	import os
	import tempfile

	import bentoml
	import ffmpeg
	import numpy as np
	import torch
	from faster_whisper import WhisperModel
	from transformers import pipeline


	class AudioTranscriber(bentoml.Runnable):
	    """BentoML runnable exposing two Whisper transcription paths.

	    * ``transcribe_audio_faster`` runs the faster-whisper model on an audio
	      file path.
	    * ``transcribe_audio`` decodes an uploaded file with the ffmpeg CLI and
	      feeds the waveform to a Transformers speech-recognition pipeline.
	    """

	    SUPPORTED_RESOURCES = ("nvidia.com/gpu", "cpu")
	    SUPPORTS_CPU_MULTI_THREADING = True

	    # Sample rate (Hz) that ffmpeg resamples the decoded audio to.
	    SAMPLE_RATE = 16000

	    # Checkpoint used when no model/tokenizer/extractor attributes are set.
	    # NOTE(review): chosen to mirror the "base" faster-whisper model loaded
	    # in __init__ -- confirm this is the intended checkpoint.
	    HF_WHISPER_CHECKPOINT = "openai/whisper-base"

	    def __init__(self):
	        self.faster_model = WhisperModel("base")
	        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	        # Built on first use by _get_asr_pipeline() so that callers who only
	        # use the faster-whisper path never pay for loading a second model.
	        self._asr_pipeline = None

	    @bentoml.Runnable.method(batchable=False)
	    def transcribe_audio_faster(self, temp_file_path):
	        """Transcribe the audio file at ``temp_file_path`` with faster-whisper.

	        Each segment is also printed as ``[start -> end] text``.

	        Returns:
	            A ``(transcription, info, segment_info)`` tuple: the list of
	            segment texts, the second value returned by
	            ``WhisperModel.transcribe`` (passed through unchanged), and the
	            list of ``(start, end)`` pairs, one per segment.
	        """
	        segments, info = self.faster_model.transcribe(temp_file_path)
	        transcription = []
	        segment_info = []
	        for segment in segments:
	            print(f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}")
	            transcription.append(segment.text)
	            segment_info.append((segment.start, segment.end))

	        return transcription, info, segment_info

	    @bentoml.Runnable.method(batchable=False)
	    def transcribe_audio(self, file):
	        """Transcribe an uploaded audio file with the Transformers pipeline.

	        Args:
	            file: object whose ``read()`` returns the encoded audio bytes.

	        Returns:
	            The output of the speech-recognition pipeline, unchanged.

	        Raises:
	            RuntimeError: if ffmpeg fails to decode the audio.
	        """
	        input_features = self._decode_audio(file.read())

	        print("Input_features", type(input_features))

	        return self.get_long_transcription_whisper(
	            input_features, self._get_asr_pipeline()
	        )

	    def _decode_audio(self, payload):
	        """Decode encoded audio bytes into a mono float32 waveform."""
	        # ffmpeg reads its input by path, so the bytes are spooled to disk.
	        # delete=False lets the ffmpeg subprocess open the file after it is
	        # closed here; the finally clause removes it on every exit path.
	        temp_file = tempfile.NamedTemporaryFile(delete=False)
	        try:
	            with temp_file:
	                temp_file.write(payload)

	            # This launches a subprocess to decode audio while down-mixing
	            # and resampling as necessary. Requires the ffmpeg CLI and the
	            # `ffmpeg-python` package to be installed.
	            out, _ = (
	                ffmpeg.input(temp_file.name, threads=0)
	                .output("-", format="s16le", acodec="pcm_s16le", ac=1, ar=self.SAMPLE_RATE)
	                .run(cmd="ffmpeg", capture_stdout=True, capture_stderr=True)
	            )
	        except ffmpeg.Error as e:
	            # errors="replace" keeps undecodable stderr bytes from masking
	            # the real failure with a UnicodeDecodeError.
	            detail = e.stderr.decode(errors="replace")
	            raise RuntimeError(f"Failed to load audio: {detail}") from e
	        finally:
	            os.unlink(temp_file.name)

	        # Signed 16-bit PCM samples scaled into [-1.0, 1.0).
	        return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0

	    def _get_asr_pipeline(self):
	        """Return the Transformers ASR pipeline, building it on first use."""
	        if self._asr_pipeline is None:
	            # __init__ does not assign model / tokenizer / extractor, so
	            # reading them directly raises AttributeError. Use them when
	            # something else has set them; otherwise load from the default
	            # checkpoint.
	            model = getattr(self, "model", None)
	            if model is None:
	                model = self.HF_WHISPER_CHECKPOINT
	            self._asr_pipeline = pipeline(
	                "automatic-speech-recognition",
	                model=model,
	                tokenizer=getattr(self, "tokenizer", None),
	                feature_extractor=getattr(self, "extractor", None),
	                device=self.device,
	            )
	        return self._asr_pipeline

	    @staticmethod
	    def get_long_transcription_whisper(input_features, pipe, return_timestamps=True,
	                                       chunk_length_s=10, stride_length_s=2):
	        """Transcribe a long waveform by running ``pipe`` in chunked mode.

	        Args:
	            input_features: ``numpy.ndarray`` holding the audio samples.
	            pipe: callable invoked as ``pipe(input_features, **options)``.
	            return_timestamps: forwarded to ``pipe``.
	            chunk_length_s: forwarded to ``pipe``.
	            stride_length_s: forwarded to ``pipe``.

	        Returns:
	            Whatever ``pipe`` returns.
	        """
	        return pipe(input_features, return_timestamps=return_timestamps,
	                    chunk_length_s=chunk_length_s, stride_length_s=stride_length_s)