import spaces
import gradio as gr
import subprocess
import sys
import os
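# NOTE: torch, torchaudio, numpy and transformers are imported inside the
# __main__ block below, only after requirements.txt has been installed; they
# become module-level globals that transcribe() uses at call time.
# @spaces.GPU asks ZeroGPU to attach a GPU only for the duration of each call.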
@spaces.GPU
def transcribe(audio_file):
    try:
        # Load audio file
        waveform, sample_rate = torchaudio.load(audio_file)
        # Move waveform to the correct device
        waveform = waveform.to(device)
        # Get the duration of the audio
        duration = waveform.shape[1] / sample_rate
        # Reject clips outside the supported duration range
        if duration < MIN_LENGTH or duration > MAX_LENGTH:
            return f"Audio must be between {MIN_LENGTH} and {MAX_LENGTH} seconds (got {duration:.1f} s)."
        # Resample if necessary
        if sample_rate != 16000:
            resampler = torchaudio.transforms.Resample(sample_rate, 16000).to(device)
            waveform = resampler(waveform)
        # Convert to mono if stereo
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)
        # Move to CPU for numpy conversion
        waveform = waveform.cpu()
        audio_input = waveform.squeeze().numpy()
        # Ensure audio input is float32
        if audio_input.dtype != np.float32:
            audio_input = audio_input.astype(np.float32)
        # Process audio input
        input_values = processor(
            audio_input,
            sampling_rate=16_000,
            return_tensors="pt"
        ).input_values.to(device)
        # Convert to float16 if using CUDA
        if torch_dtype == torch.float16:
            input_values = input_values.half()
        # Generate transcription
        with torch.no_grad():
            logits = model(input_values).logits
        # Use language model for decoding
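        # processor.decode() runs beam-search CTC decoding with the n-gram LM
        # bundled with the checkpoint (via pyctcdecode); it returns a
        # Wav2Vec2DecoderWithLMOutput whose .text field holds the transcription.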
        transcription = processor.decode(logits[0].cpu().numpy())
        # Return the transcription text in lowercase
        print(transcription)
        return transcription.text.lower()
    except Exception as e:
        return f"Error during transcription: {str(e)}"
# Create Gradio interface
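# The .textbox1 rules below render the output with Thaana fonts
# (MV_Faseyha / Faruma / A_Faruma) so the Dhivehi text displays correctly;
# .textbox2 simply hides any textbox tagged with that class.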
css = """
.textbox1 textarea {
    font-size: 18px !important;
    font-family: 'MV_Faseyha', 'Faruma', 'A_Faruma' !important;
    line-height: 1.8 !important;
}
.textbox2 textarea {
    display: none;
}
"""
demo = gr.Blocks(css=css)
tab_audio = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(sources=["upload", "microphone"], type="filepath", label="Audio"),
    ],
    outputs=gr.Textbox(label="Transcription", rtl=True, elem_classes="textbox1"),
    title="Transcribe Dhivehi Audio",
    allow_flagging="never",
)
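# Wrapping the Interface in a Blocks context applies the custom CSS app-wide
# and leaves room to add more tabs alongside "Audio" later.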
with demo:
    gr.TabbedInterface([tab_audio], ["Audio"])
def install_requirements():
    requirements_path = 'requirements.txt'
    # Check if requirements.txt exists
    if not os.path.exists(requirements_path):
        print("Error: requirements.txt not found")
        return False
    try:
        print("Installing requirements...")
        # Using --no-cache-dir to avoid memory issues
        subprocess.check_call([
            sys.executable,
            "-m",
            "pip",
            "install",
            "-r",
            requirements_path,
            "--no-cache-dir"
        ])
        print("Successfully installed all requirements")
        return True
    except subprocess.CalledProcessError as e:
        print(f"Error installing requirements: {e}")
        return False
    except Exception as e:
        print(f"Unexpected error: {e}")
        return False
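# Hugging Face Spaces normally installs requirements.txt at build time; the
# runtime install above makes the heavy imports in __main__ wait until the
# dependencies are confirmed to be present.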
# Launch the interface
if __name__ == "__main__":
    success = install_requirements()
    if success:
        print("All requirements installed successfully")
        from transformers import Wav2Vec2ForCTC, Wav2Vec2ProcessorWithLM
        import torch
        import torchaudio
        import numpy as np
        # Device and dtype configuration
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
        MODEL_NAME = "alakxender/wav2vec2-large-mms-1b-dv-syn-md"  # Trained on 100% Synthetic Data (150 Hours)
        # MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/wav2vec2-large-mms-1b-cv"  # Trained on Common Voice Data (Unknown Hours)
        # MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/whisper-small-dv-syn-md"  # Trained on 100% Synthetic Data (150 Hours)
        # MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/whisper-small-cv"  # Trained on Common Voice Data (Unknown Hours)
        # MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/whisper-medium-dv-syn-md"  # Trained on 100% Synthetic Data (150 Hours)
        # MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/whisper-medium-cv"  # Trained on Common Voice Data (Unknown Hours)
        # MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/whisper-large-v3-dv-syn-md"  # Trained on 100% Synthetic Data (150 Hours)
        # MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/whisper-large-v3-cv"  # Trained on Common Voice Data (Unknown Hours)
        # MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/whisper-large-v3-calls-md"  # Trained on phone calls (65 Hours)
        # MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/wav2vec2-large-mms-1b-calls-md"  # Trained on phone calls (65 Hours)
        # MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/wav2vec2-large-xlsr-calls-md"  # Trained on phone calls (23 Hours)
        # MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/wav2vec2-large-xlsr-dv-syn-md"  # Trained on 100% Synthetic Data (80 Hours)
        # MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/dhivehi-asr-full-ctc"  # Trained on multiple datasets (350+ Hours)
        # MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/dhivehi-asr-full-ctc-v2"  # Trained on multiple datasets (350+ Hours)
        # MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/dhivehi-asr-full-whisper-v3"  # Trained on multiple datasets (350+ Hours)
        # Load model and processor with LM
        processor = Wav2Vec2ProcessorWithLM.from_pretrained(MODEL_NAME)
        model = Wav2Vec2ForCTC.from_pretrained(
            MODEL_NAME,
            torch_dtype=torch_dtype
        ).to(device)
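        # Wav2Vec2ProcessorWithLM needs pyctcdecode (and kenlm) installed, so
        # both should be listed in requirements.txt alongside torch, torchaudio,
        # transformers and numpy.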
        MAX_LENGTH = 120  # 2 minutes
        MIN_LENGTH = 1    # 1 second
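        # Quick local check once the model is loaded (the file name is only an
        # example; any short clip readable by torchaudio works):
        # print(transcribe("sample.wav"))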
        demo.launch()
    else:
        print("Failed to install some requirements")