import spaces
import gradio as gr
import subprocess
import sys
import os
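
# Only lightweight imports happen at module level; torch, torchaudio, numpy and
# transformers are imported in the __main__ block below, after
# install_requirements() has run.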
@spaces.GPU  # allocate a GPU for this call when running on ZeroGPU Spaces
def transcribe(audio_file):
    try:
        # Load the audio file
        waveform, sample_rate = torchaudio.load(audio_file)

        # Move the waveform to the target device
        waveform = waveform.to(device)

        # Duration of the clip in seconds
        duration = waveform.shape[1] / sample_rate

        # Reject clips outside the supported length range
        if duration < MIN_LENGTH or duration > MAX_LENGTH:
            return (
                f"Audio must be between {MIN_LENGTH} and {MAX_LENGTH} seconds. "
                f"Duration: {duration:.1f} seconds"
            )

        # Resample to 16 kHz if necessary
        if sample_rate != 16000:
            resampler = torchaudio.transforms.Resample(sample_rate, 16000).to(device)
            waveform = resampler(waveform)

        # Downmix stereo to mono
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)

        # Move back to CPU for the numpy conversion
        waveform = waveform.cpu()
        audio_input = waveform.squeeze().numpy()

        # The feature extractor expects float32 input
        if audio_input.dtype != np.float32:
            audio_input = audio_input.astype(np.float32)

        # Extract input features
        input_values = processor(
            audio_input,
            sampling_rate=16_000,
            return_tensors="pt"
        ).input_values.to(device)

        # Match the model dtype when running in half precision on CUDA
        if torch_dtype == torch.float16:
            input_values = input_values.half()

        # Forward pass
        with torch.no_grad():
            logits = model(input_values).logits

        # Beam-search decode with the language model; pyctcdecode expects
        # float32 logits, so cast before converting to numpy
        transcription = processor.decode(logits[0].cpu().float().numpy())

        print(transcription.text)
        # Return the transcription in lowercase
        return transcription.text.lower()
    except Exception as e:
        return f"Error during transcription: {str(e)}"
# Create the Gradio interface
css = """
.textbox1 textarea {
    font-size: 18px !important;
    font-family: 'MV_Faseyha', 'Faruma', 'A_Faruma' !important;
    line-height: 1.8 !important;
}
.textbox2 textarea {
    display: none;
}
"""
demo = gr.Blocks(css=css)

tab_audio = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(sources=["upload", "microphone"], type="filepath", label="Audio"),
    ],
    outputs=gr.Textbox(label="Transcription", rtl=True, elem_classes="textbox1"),
    title="Transcribe Dhivehi Audio",
    allow_flagging="never",
)

with demo:
    gr.TabbedInterface([tab_audio], ["Audio"])
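
# The TabbedInterface is rendered inside the gr.Blocks context so that the
# custom CSS passed to gr.Blocks above (Thaana font, RTL output) applies to it.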
def install_requirements():
    requirements_path = "requirements.txt"

    # Check that requirements.txt exists
    if not os.path.exists(requirements_path):
        print("Error: requirements.txt not found")
        return False

    try:
        print("Installing requirements...")
        # Using --no-cache-dir to avoid memory issues
        subprocess.check_call([
            sys.executable,
            "-m",
            "pip",
            "install",
            "-r",
            requirements_path,
            "--no-cache-dir",
        ])
        print("Successfully installed all requirements")
        return True
    except subprocess.CalledProcessError as e:
        print(f"Error installing requirements: {e}")
        return False
    except Exception as e:
        print(f"Unexpected error: {e}")
        return False
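
# A plausible requirements.txt for this Space (an assumption; the actual file
# is not shown here). Wav2Vec2ProcessorWithLM also needs pyctcdecode and kenlm:
#   gradio
#   torch
#   torchaudio
#   transformers
#   numpy
#   pyctcdecode
#   kenlm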
# Launch the interface
if __name__ == "__main__":
    success = install_requirements()
    if success:
        print("All requirements installed successfully")

        # Heavy imports are deferred until the dependencies are installed
        from transformers import Wav2Vec2ForCTC, Wav2Vec2ProcessorWithLM
        import torch
        import torchaudio
        import numpy as np

        # Device and dtype configuration
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
MODEL_NAME = "alakxender/wav2vec2-large-mms-1b-dv-syn-md" # Trained on 100% Synthetic Data (150 Hours) | |
# MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/wav2vec2-large-mms-1b-cv" # Trained on Common Voice Data (Unknown Hours) | |
# MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/whisper-small-dv-syn-md" # Trained on 100% Synthetic Data (150 Hours) | |
# MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/whisper-small-cv" # Trained on Common Voice Data (Unknown Hours) | |
# MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/whisper-medium-dv-syn-md" # Trained on 100% Synthetic Data (150 Hours) | |
# MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/whisper-medium-cv" # Trained on Common Voice Data (Unknown Hours) | |
# MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/whisper-large-v3-dv-syn-md" # Trained on 100% Synthetic Data (150 Hours) | |
# MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/whisper-large-v3-cv" # Trained on Common Voice Data (Unknown Hours) | |
# MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/whisper-large-v3-calls-md" # Trained on phone calls (65 Hours) | |
# MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/wav2vec2-large-mms-1b-calls-md" # Trained on phone calls (65 Hours) | |
# MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/wav2vec2-large-xlsr-calls-md" # Trained on phone calls (23 Hours) | |
# MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/wav2vec2-large-xlsr-dv-syn-md" # Trained on 100% Synthetic Data (80 Hours) | |
# MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/dhivehi-asr-full-ctc" # Trained on multiple datasets (350+ Hours) | |
# MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/dhivehi-asr-full-ctc-v2" # Trained on multiple datasets (350+ Hours) | |
# MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/dhivehi-asr-full-whisper-v3" # Trained on multiple datasets (350+ Hours) | |
        # Load the model and the processor with its n-gram language model
        processor = Wav2Vec2ProcessorWithLM.from_pretrained(MODEL_NAME)
        model = Wav2Vec2ForCTC.from_pretrained(
            MODEL_NAME,
            torch_dtype=torch_dtype
        ).to(device)
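
        # For comparison, a plain greedy (argmax) CTC decode without the
        # language model would be a minimal sketch like:
        #   pred_ids = torch.argmax(logits, dim=-1)
        #   text = processor.tokenizer.batch_decode(pred_ids)[0]
        # The LM-backed processor.decode() used in transcribe() typically
        # yields better transcriptions than this baseline.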
        MAX_LENGTH = 120  # maximum clip length in seconds (2 minutes)
        MIN_LENGTH = 1    # minimum clip length in seconds

        demo.launch()
    else:
        print("Failed to install some requirements")