# File-viewer page residue (not part of the program): alakxender's picture · u · d2a74d8 · raw · history blame · 4.13 kB
import spaces
import gradio as gr
import subprocess
import sys
import os
def transcribe(audio_file):
    """Transcribe an audio file to lowercase text.

    Depends on module-level names that the script's ``__main__`` section
    binds before the interface is launched: ``torch``, ``torchaudio``,
    ``np``, ``device``, ``torch_dtype``, ``processor``, ``model``,
    ``MIN_LENGTH`` and ``MAX_LENGTH``.

    Args:
        audio_file: Path of the audio file to transcribe. A falsy value
            (``None`` or ``""``) is reported as a missing input.

    Returns:
        The decoded transcription in lowercase. When the input is missing,
        the clip duration is outside ``[MIN_LENGTH, MAX_LENGTH]`` seconds,
        or any step raises, a human-readable message string is returned
        instead (the function never raises ``Exception`` subclasses).
    """
    # Guard against a missing path so the caller gets a clear message
    # instead of whatever the audio loader raises for None/"".
    if not audio_file:
        return "Error during transcription: no audio file provided"

    try:
        waveform, sample_rate = torchaudio.load(audio_file)

        # Validate the duration first: there is no point moving a clip to
        # the device only to reject it.
        duration = waveform.shape[1] / sample_rate
        if duration < MIN_LENGTH or duration > MAX_LENGTH:
            return f"Audio duration is too short or too long. Duration: {duration} seconds"

        waveform = waveform.to(device)

        # The processor below is called with sampling_rate=16_000, so bring
        # any other rate to 16 kHz.
        if sample_rate != 16000:
            resampler = torchaudio.transforms.Resample(sample_rate, 16000).to(device)
            waveform = resampler(waveform)

        # Average the channels down to a single one when there are several.
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)

        # numpy conversion requires a CPU tensor; copy=False skips the cast
        # when the array is already float32.
        audio_input = waveform.cpu().squeeze().numpy().astype(np.float32, copy=False)

        input_values = processor(
            audio_input,
            sampling_rate=16_000,
            return_tensors="pt",
        ).input_values
        # Match the dtype the model was loaded with (a no-op for float32).
        input_values = input_values.to(device=device, dtype=torch_dtype)

        with torch.no_grad():
            logits = model(input_values).logits

        # Upcast before leaving torch so the numpy-based LM decoder never
        # works in half precision; the cast is lossless and a no-op for
        # float32 logits.
        transcription = processor.decode(logits[0].float().cpu().numpy())

        print(transcription)
        return transcription.text.lower()
    except Exception as e:
        return f"Error during transcription: {str(e)}"
# Gradio UI definition: a single audio input, delivered to `transcribe` as a
# file path, and a plain-text output for the result.
iface = gr.Interface(
    title="Dhivehi Speech Recognition with Language Model",
    description="Upload an audio file to transcribe Dhivehi speech to text using language model enhanced decoding.",
    fn=transcribe,
    inputs=gr.Audio(type="filepath"),
    outputs="text",
)
def install_requirements(requirements_path='requirements.txt'):
    """Install the packages listed in a pip requirements file.

    Runs ``<current interpreter> -m pip install -r <requirements_path>
    --no-cache-dir`` and reports progress on stdout.

    Args:
        requirements_path: Path of the requirements file. Defaults to
            ``requirements.txt``, resolved against the current working
            directory.

    Returns:
        True when pip exits successfully; False when the path is not an
        existing file, pip exits with a non-zero status, or launching pip
        fails for any other reason.
    """
    # isfile (rather than exists) also rejects a directory of that name
    # up front instead of letting pip fail on it.
    if not os.path.isfile(requirements_path):
        print(f"Error: {requirements_path} not found")
        return False

    pip_command = [
        sys.executable,  # install into the interpreter running this script
        "-m",
        "pip",
        "install",
        "-r",
        requirements_path,
        "--no-cache-dir",  # original author's note: avoids memory issues
    ]

    print("Installing requirements...")
    try:
        subprocess.check_call(pip_command)
    except subprocess.CalledProcessError as e:
        # pip ran but returned a non-zero exit status
        print(f"Error installing requirements: {e}")
        return False
    except Exception as e:
        # Best-effort boundary: e.g. the interpreter could not be launched
        print(f"Unexpected error: {e}")
        return False
    else:
        print("Successfully installed all requirements")
        return True
# Script entry point: install dependencies, load the model, start the UI.
if __name__ == "__main__":
    success = install_requirements()
    if not success:
        print("Failed to install some requirements")
    else:
        print("All requirements installed successfully")

        # These imports run only after the install step has succeeded
        # (presumably because the packages may not be present before it).
        # They bind module-level names that `transcribe` reads.
        import numpy as np
        import torch
        import torchaudio
        from transformers import Wav2Vec2ForCTC, Wav2Vec2ProcessorWithLM

        # Accepted clip duration, in seconds
        MIN_LENGTH = 1  # 1 second
        MAX_LENGTH = 120  # 2 minutes

        # Use CUDA with half precision when available, otherwise CPU with
        # single precision.
        if torch.cuda.is_available():
            device = torch.device("cuda")
        else:
            device = torch.device("cpu")
        torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

        # Processor (with language model) and acoustic model come from the
        # same checkpoint.
        processor = Wav2Vec2ProcessorWithLM.from_pretrained("alakxender/wav2vec2-large-mms-1b-dv-syn-md")
        model = Wav2Vec2ForCTC.from_pretrained(
            "alakxender/wav2vec2-large-mms-1b-dv-syn-md",
            torch_dtype=torch_dtype,
        )
        model = model.to(device)

        iface.launch()