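"""Gradio Space for Dhivehi (Thaana) speech-to-text.

Transcribes uploaded or recorded audio with a fine-tuned wav2vec2 CTC model,
decoding the logits with an n-gram language model (Wav2Vec2ProcessorWithLM).
Requirements are installed at runtime before the model is loaded.
"""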
import spaces
import gradio as gr
import subprocess
import sys
import os
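
# On a Hugging Face ZeroGPU Space, @spaces.GPU allocates a GPU for the
# duration of each call; on hardware with a dedicated GPU it is a no-op.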
@spaces.GPU
def transcribe(audio_file):
    try:
        # Load audio file
        waveform, sample_rate = torchaudio.load(audio_file)

        # Move waveform to the correct device
        waveform = waveform.to(device)

        # Get the duration of the audio in seconds
        duration = waveform.shape[1] / sample_rate

        # Reject audio that is too short or too long
        if duration < MIN_LENGTH or duration > MAX_LENGTH:
            return (
                f"Audio must be between {MIN_LENGTH} and {MAX_LENGTH} seconds. "
                f"Duration: {duration:.1f} seconds"
            )
        # Resample if necessary
        if sample_rate != 16000:
            resampler = torchaudio.transforms.Resample(sample_rate, 16000).to(device)
            waveform = resampler(waveform)

        # Convert to mono if stereo
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)

        # Move to CPU for numpy conversion
        waveform = waveform.cpu()
        audio_input = waveform.squeeze().numpy()

        # Ensure audio input is float32
        if audio_input.dtype != np.float32:
            audio_input = audio_input.astype(np.float32)
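        # The wav2vec2/MMS feature extractor expects a mono, 16 kHz, float32
        # 1-D array, which is exactly what the steps above produce.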
        # Process audio input
        input_values = processor(
            audio_input,
            sampling_rate=16_000,
            return_tensors="pt"
        ).input_values.to(device)

        # Convert to float16 if using CUDA
        if torch_dtype == torch.float16:
            input_values = input_values.half()

        # Generate transcription
        with torch.no_grad():
            logits = model(input_values).logits
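        # processor.decode() runs a beam search over the CTC logits and
        # rescores hypotheses with the bundled n-gram language model
        # (pyctcdecode), rather than taking a plain per-frame argmax.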
        # Use language model for decoding (cast logits to float32 first,
        # since they may be float16 when running on CUDA)
        transcription = processor.decode(logits[0].cpu().float().numpy())

        # decode() returns a Wav2Vec2DecoderWithLMOutput; the transcription
        # string is in its .text field. Return it in lowercase.
        print(transcription.text)
        return transcription.text.lower()
    except Exception as e:
        return f"Error during transcription: {str(e)}"
# Create Gradio interface
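# 'MV_Faseyha', 'Faruma' and 'A_Faruma' are Thaana fonts, so the Dhivehi
# transcription renders legibly; the output textbox is also right-to-left
# (rtl=True below).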
css = """
.textbox1 textarea {
font-size: 18px !important;
font-family: 'MV_Faseyha', 'Faruma', 'A_Faruma' !important;
line-height: 1.8 !important;
}
.textbox2 textarea {
display: none;
}
"""
demo = gr.Blocks(css=css)
tab_audio = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(sources=["upload", "microphone"], type="filepath", label="Audio"),
    ],
    outputs=gr.Textbox(label="Transcription", rtl=True, elem_classes="textbox1"),
    title="Transcribe Dhivehi Audio",
    allow_flagging="never",
)
with demo:
    gr.TabbedInterface([tab_audio], ["Audio"])
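
# Requirements are installed at runtime so that the heavy dependencies
# (transformers, torch, torchaudio, numpy) are only imported in the
# __main__ block below, after installation has succeeded.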
def install_requirements():
    requirements_path = 'requirements.txt'

    # Check if requirements.txt exists
    if not os.path.exists(requirements_path):
        print("Error: requirements.txt not found")
        return False

    try:
        print("Installing requirements...")
        # Using --no-cache-dir to avoid memory issues
        subprocess.check_call([
            sys.executable,
            "-m",
            "pip",
            "install",
            "-r",
            requirements_path,
            "--no-cache-dir"
        ])
        print("Successfully installed all requirements")
        return True
    except subprocess.CalledProcessError as e:
        print(f"Error installing requirements: {e}")
        return False
    except Exception as e:
        print(f"Unexpected error: {e}")
        return False
# Launch the interface
if __name__ == "__main__":
    success = install_requirements()
    if success:
        print("All requirements installed successfully")
        from transformers import Wav2Vec2ForCTC, Wav2Vec2ProcessorWithLM
        import torch
        import torchaudio
        import numpy as np

        # Device and dtype configuration
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
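        # float16 roughly halves GPU memory use for the ~1B-parameter MMS
        # model; on CPU, inference stays in float32.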
MODEL_NAME = "alakxender/wav2vec2-large-mms-1b-dv-syn-md" # Trained on 100% Synthetic Data (150 Hours)
# MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/wav2vec2-large-mms-1b-cv" # Trained on Common Voice Data (Unknown Hours)
# MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/whisper-small-dv-syn-md" # Trained on 100% Synthetic Data (150 Hours)
# MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/whisper-small-cv" # Trained on Common Voice Data (Unknown Hours)
# MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/whisper-medium-dv-syn-md" # Trained on 100% Synthetic Data (150 Hours)
# MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/whisper-medium-cv" # Trained on Common Voice Data (Unknown Hours)
# MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/whisper-large-v3-dv-syn-md" # Trained on 100% Synthetic Data (150 Hours)
# MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/whisper-large-v3-cv" # Trained on Common Voice Data (Unknown Hours)
# MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/whisper-large-v3-calls-md" # Trained on phone calls (65 Hours)
# MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/wav2vec2-large-mms-1b-calls-md" # Trained on phone calls (65 Hours)
# MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/wav2vec2-large-xlsr-calls-md" # Trained on phone calls (23 Hours)
# MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/wav2vec2-large-xlsr-dv-syn-md" # Trained on 100% Synthetic Data (80 Hours)
# MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/dhivehi-asr-full-ctc" # Trained on multiple datasets (350+ Hours)
# MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/dhivehi-asr-full-ctc-v2" # Trained on multiple datasets (350+ Hours)
# MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/dhivehi-asr-full-whisper-v3" # Trained on multiple datasets (350+ Hours)
        # Load model and processor with LM
        processor = Wav2Vec2ProcessorWithLM.from_pretrained(MODEL_NAME)
        model = Wav2Vec2ForCTC.from_pretrained(
            MODEL_NAME,
            torch_dtype=torch_dtype
        ).to(device)
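        # Wav2Vec2ProcessorWithLM bundles the feature extractor, tokenizer and
        # the n-gram language model shipped with the checkpoint (decoded via
        # pyctcdecode), which transcribe() uses for LM-rescored beam search.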
        MAX_LENGTH = 120  # 2 minutes
        MIN_LENGTH = 1    # 1 second

        demo.launch()
    else:
        print("Failed to install some requirements")