Spaces:

alakxender
/

dhivehi-mms-demo

Running on Zero

dhivehi-mms-demo / app.py

d2a74d8 5 months ago

4.13 kB

	import spaces
	import gradio as gr
	import subprocess
	import sys
	import os

	def transcribe(audio_file):
	    """Transcribe an audio file to lowercase text with LM-assisted decoding.

	    This function reads several module-level names that are only created in
	    the ``__main__`` section of this file (``torch``, ``torchaudio``, ``np``,
	    ``device``, ``torch_dtype``, ``processor``, ``model``, ``MIN_LENGTH`` and
	    ``MAX_LENGTH``), so it is usable only after that section has run.

	    Args:
	        audio_file: Path of the audio file to transcribe, as delivered by
	            the ``gr.Audio(type="filepath")`` input. A falsy value
	            (``None`` or an empty string) is treated as "nothing provided".

	    Returns:
	        The lowercase transcription on success. Otherwise a human-readable
	        message: when no audio was provided, when the duration falls outside
	        ``MIN_LENGTH``..``MAX_LENGTH`` seconds, or when any step raised.
	    """
	    # NOTE(review): this file imports `spaces` but no `@spaces.GPU` decorator
	    # is applied here -- confirm whether the hosting hardware requires it.

	    # Fail with a clear message instead of a low-level loader error.
	    if not audio_file:
	        return "No audio provided. Please upload or record an audio file."

	    target_rate = 16_000  # sampling rate passed to the processor below

	    try:
	        waveform, sample_rate = torchaudio.load(audio_file)

	        # Duration in seconds = frames / frames-per-second. Checked before
	        # any device transfer so rejected files cost as little as possible.
	        duration = waveform.shape[1] / sample_rate
	        if duration < MIN_LENGTH or duration > MAX_LENGTH:
	            return f"Audio duration is too short or too long. Duration: {duration} seconds"

	        waveform = waveform.to(device)

	        # Down-mix first so that only a single channel has to be resampled.
	        if waveform.shape[0] > 1:
	            waveform = waveform.mean(dim=0, keepdim=True)

	        if sample_rate != target_rate:
	            resampler = torchaudio.transforms.Resample(sample_rate, target_rate).to(device)
	            waveform = resampler(waveform)

	        # The processor is fed a float32 NumPy array, which must live on CPU.
	        audio_input = waveform.cpu().squeeze().numpy().astype(np.float32, copy=False)

	        input_values = processor(
	            audio_input,
	            sampling_rate=target_rate,
	            return_tensors="pt",
	        ).input_values.to(device)

	        # Match the precision the model was loaded with.
	        if torch_dtype == torch.float16:
	            input_values = input_values.half()

	        with torch.no_grad():
	            logits = model(input_values).logits

	        # Decode with the language model. Logits are cast to float32 first so
	        # the NumPy-side decoder never has to work on half-precision values.
	        decoded = processor.decode(logits[0].float().cpu().numpy())
	        print(decoded)
	        return decoded.text.lower()

	    except Exception as e:
	        # UI boundary: report the failure to the user instead of raising.
	        return f"Error during transcription: {e}"

	# Gradio UI: a single audio input (handed to `transcribe` as a file path)
	# and a plain-text output for the transcription.
	iface = gr.Interface(
	    title="Dhivehi Speech Recognition with Language Model",
	    description="Upload an audio file to transcribe Dhivehi speech to text using language model enhanced decoding.",
	    fn=transcribe,
	    inputs=gr.Audio(type="filepath"),
	    outputs="text",
	)


	def install_requirements(requirements_path='requirements.txt'):
	    """Install the packages listed in a pip requirements file.

	    Runs ``pip install -r <requirements_path> --no-cache-dir`` through the
	    interpreter that is executing this script (``sys.executable -m pip``)
	    and prints progress and error messages to stdout.

	    Args:
	        requirements_path: Path of the requirements file. Defaults to
	            ``'requirements.txt'`` relative to the current working
	            directory.

	    Returns:
	        ``True`` if pip exited successfully; ``False`` if the file does not
	        exist (or is not a regular file) or the installation failed.
	    """
	    # A directory with that name would pass an existence check but cannot be
	    # installed from, so require a regular file.
	    if not os.path.isfile(requirements_path):
	        print(f"Error: {requirements_path} not found")
	        return False

	    pip_command = [
	        sys.executable,
	        "-m",
	        "pip",
	        "install",
	        "-r",
	        requirements_path,
	        "--no-cache-dir",  # skip pip's cache (original note: avoids memory issues)
	    ]

	    print("Installing requirements...")
	    try:
	        subprocess.check_call(pip_command)
	    except subprocess.CalledProcessError as e:
	        # pip ran but returned a non-zero exit status.
	        print(f"Error installing requirements: {e}")
	        return False
	    except Exception as e:
	        # Anything else (e.g. the interpreter could not be launched) is
	        # reported the same best-effort way rather than crashing start-up.
	        print(f"Unexpected error: {e}")
	        return False

	    print("Successfully installed all requirements")
	    return True

	# Script entry point: install dependencies, load the model, launch the UI.
	if __name__ == "__main__":
	    success = install_requirements()
	    if not success:
	        print("Failed to install some requirements")
	        # Exit with a non-zero status so a failed start-up is not reported
	        # as a clean run.
	        sys.exit(1)

	    print("All requirements installed successfully")

	    # These imports run only after install_requirements() has succeeded --
	    # presumably because the packages may not be installed before that.
	    from transformers import Wav2Vec2ForCTC, Wav2Vec2ProcessorWithLM
	    import torch
	    import torchaudio
	    import numpy as np

	    # Everything below is defined at module level on purpose: `transcribe`
	    # looks these names up as globals when it is called.

	    # Device and dtype configuration: half precision on CUDA, float32 on CPU.
	    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	    torch_dtype = torch.float16 if device.type == "cuda" else torch.float32

	    # Load the processor (with language model) and the acoustic model from
	    # the same checkpoint.
	    MODEL_ID = "alakxender/wav2vec2-large-mms-1b-dv-syn-md"
	    processor = Wav2Vec2ProcessorWithLM.from_pretrained(MODEL_ID)
	    model = Wav2Vec2ForCTC.from_pretrained(
	        MODEL_ID,
	        torch_dtype=torch_dtype,
	    ).to(device)

	    # Accepted audio duration bounds, in seconds.
	    MAX_LENGTH = 120  # 2 minutes
	    MIN_LENGTH = 1  # 1 second

	    iface.launch()