"""Whisper Speech Recognition Module""" | |
import sys | |
import whisperx | |
from torch.cuda import OutOfMemoryError | |
from modules.console_colors import ULTRASINGER_HEAD, blue_highlighted, red_highlighted | |
from modules.Speech_Recognition.TranscribedData import TranscribedData | |
def transcribe_with_whisper(
    audio_path: str,
    model: str,
    device: str = "cpu",
    model_name: str | None = None,
    batch_size: int = 16,
    compute_type: str | None = None,
    language: str | None = None,
) -> tuple[list[TranscribedData], str]:
"""Transcribe with whisper""" | |
# Info: Regardless of the audio sampling rate used in the original audio file, whisper resample the audio signal to 16kHz (via ffmpeg). So the standard input from (44.1 or 48 kHz) should work. | |
    print(
        f"{ULTRASINGER_HEAD} Loading {blue_highlighted('whisper')} with model {blue_highlighted(model)} and {red_highlighted(device)} as worker"
    )
    if model_name is not None:
        print(f"{ULTRASINGER_HEAD} using alignment model {blue_highlighted(model_name)}")
    if compute_type is None:
        compute_type = "float16" if device == "cuda" else "int8"

    try:
        loaded_whisper_model = whisperx.load_model(
            model, language=language, device=device, compute_type=compute_type
        )
    except ValueError as value_error:
        if (
            "Requested float16 compute type, but the target device or backend do not support efficient float16 computation."
            in str(value_error.args[0])
        ):
            print(value_error)
            print(
                f"{ULTRASINGER_HEAD} Your GPU does not support efficient float16 computation; run UltraSinger with '--whisper_compute_type int8'"
            )
            sys.exit(1)
        raise value_error
    except OutOfMemoryError as oom_exception:
        print(oom_exception)
        print(
            f"{ULTRASINGER_HEAD} {blue_highlighted('whisper')} ran out of GPU memory; reduce --whisper_batch_size or force usage of cpu with --force_cpu"
        )
        sys.exit(1)
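
    # whisperx.load_audio decodes the file with ffmpeg and resamples it to 16 kHz mono,
    # matching the sample rate whisper expects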
    audio = whisperx.load_audio(audio_path)

    print(f"{ULTRASINGER_HEAD} Transcribing {audio_path}")

    result = loaded_whisper_model.transcribe(
        audio, batch_size=batch_size, language=language
    )
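
    # whisperx reports the language it detected (or the one that was forced) as a code like "en"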
    detected_language = result["language"]
    if language is None:
        language = detected_language

    # load alignment model and metadata
    try:
        model_a, metadata = whisperx.load_align_model(
            language_code=language, device=device, model_name=model_name
        )
    except ValueError as ve:
        print(
            f"{red_highlighted(f'{ve}')}"
            f"\n"
            f"{ULTRASINGER_HEAD} {red_highlighted('Error:')} Unknown language. "
            f"Try adding it with --align_model [huggingface]."
        )
        sys.exit(1)

    # align whisper output
    result_aligned = whisperx.align(
        result["segments"],
        model_a,
        metadata,
        audio,
        device,
        return_char_alignments=False,
    )
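
    # result_aligned["segments"] now carries per-word dicts with "start"/"end" timestamps
    # (words the aligner could not place are missing those keys)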
    transcribed_data = convert_to_transcribed_data(result_aligned)

    return transcribed_data, detected_language


def convert_to_transcribed_data(result_aligned):
    """Convert the aligned whisperx output into a list of TranscribedData objects."""
    transcribed_data = []
    for segment in result_aligned["segments"]:
        for obj in segment["words"]:
            vtd = TranscribedData(obj)  # create custom Word object
            vtd.word = vtd.word + " "  # add space to end of word
            if len(obj) < 4:  # word dict is missing its "start"/"end" timestamps
                if transcribed_data:
                    previous_end = transcribed_data[-1].end
                    previous_word = transcribed_data[-1].word
                else:
                    # no previous word exists, so anchor the timestamps to the start of the audio
                    previous_end = 0
                    previous_word = ""
                vtd.start = previous_end + 0.1
                vtd.end = previous_end + 0.2
                msg = (
                    f'Error: There is no timestamp for word: "{obj["word"]}". '
                    f'Fixing it by placing it after the previous word: "{previous_word}". '
                    f'At start: {vtd.start} end: {vtd.end}. Fix it manually!'
                )
                print(f"{red_highlighted(msg)}")
            transcribed_data.append(vtd)  # and add it to list
    return transcribed_data
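

# Example usage (a minimal sketch; the audio path and model name below are illustrative,
# not values this module prescribes):
#
#     transcribed_data, detected_language = transcribe_with_whisper(
#         audio_path="song.wav",
#         model="large-v2",
#         device="cuda",
#     )
#     for word in transcribed_data:
#         print(word.word, word.start, word.end)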