import spaces
import gradio as gr
import subprocess
import sys
import os


@spaces.GPU
def transcribe(audio_file):
    try:
        # Load audio file
        waveform, sample_rate = torchaudio.load(audio_file)

        # Move waveform to the correct device
        waveform = waveform.to(device)

        # Get the duration of the audio in seconds
        duration = waveform.shape[1] / sample_rate

        # Reject audio that is too short or too long
        if duration < MIN_LENGTH or duration > MAX_LENGTH:
            return (
                f"Audio must be between {MIN_LENGTH} and {MAX_LENGTH} seconds long. "
                f"Got {duration:.1f} seconds."
            )

        # Resample to 16 kHz if necessary
        if sample_rate != 16000:
            resampler = torchaudio.transforms.Resample(sample_rate, 16000).to(device)
            waveform = resampler(waveform)

        # Convert to mono if stereo
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)

        # Move to CPU for numpy conversion
        waveform = waveform.cpu()
        audio_input = waveform.squeeze().numpy()

        # Ensure audio input is float32
        if audio_input.dtype != np.float32:
            audio_input = audio_input.astype(np.float32)

        # Process audio input
        input_values = processor(
            audio_input,
            sampling_rate=16_000,
            return_tensors="pt"
        ).input_values.to(device)

        # Convert to float16 if using CUDA
        if torch_dtype == torch.float16:
            input_values = input_values.half()

        # Generate logits
        with torch.no_grad():
            logits = model(input_values).logits

        # Use the language model for decoding
        transcription = processor.decode(logits[0].cpu().numpy())

        # Return the transcription in lowercase
        print(transcription.text)
        return transcription.text.lower()

    except Exception as e:
        return f"Error during transcription: {str(e)}"


# Create Gradio interface
css = """
.textbox1 textarea {
    font-size: 18px !important;
    font-family: 'MV_Faseyha', 'Faruma', 'A_Faruma' !important;
    line-height: 1.8 !important;
}
.textbox2 textarea {
    display: none;
}
"""

demo = gr.Blocks(css=css)

tab_audio = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(sources=["upload", "microphone"], type="filepath", label="Audio"),
    ],
    outputs=gr.Textbox(label="Transcription", rtl=True, elem_classes="textbox1"),
    title="Transcribe Dhivehi Audio",
    allow_flagging="never",
)

with demo:
    gr.TabbedInterface([tab_audio], ["Audio"])


def install_requirements():
    requirements_path = "requirements.txt"

    # Check if requirements.txt exists
    if not os.path.exists(requirements_path):
        print("Error: requirements.txt not found")
        return False

    try:
        print("Installing requirements...")
        # Using --no-cache-dir to avoid memory issues
        subprocess.check_call([
            sys.executable, "-m", "pip", "install",
            "-r", requirements_path,
            "--no-cache-dir"
        ])
        print("Successfully installed all requirements")
        return True
    except subprocess.CalledProcessError as e:
        print(f"Error installing requirements: {e}")
        return False
    except Exception as e:
        print(f"Unexpected error: {e}")
        return False


# Launch the interface
if __name__ == "__main__":
    success = install_requirements()

    if success:
        print("All requirements installed successfully")

        # Heavy imports happen only after the requirements are installed.
        from transformers import Wav2Vec2ForCTC, Wav2Vec2ProcessorWithLM
        import torch
        import torchaudio
        import numpy as np

        # Device and dtype configuration
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

        MODEL_NAME = "alakxender/wav2vec2-large-mms-1b-dv-syn-md"  # Trained on 100% Synthetic Data (150 Hours)
        # MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/wav2vec2-large-mms-1b-cv"  # Trained on Common Voice Data (Unknown Hours)
        # MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/whisper-small-dv-syn-md"  # Trained on 100% Synthetic Data (150 Hours)
        # MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/whisper-small-cv"  # Trained on Common Voice Data (Unknown Hours)
        # MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/whisper-medium-dv-syn-md"  # Trained on 100% Synthetic Data (150 Hours)
        # MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/whisper-medium-cv"  # Trained on Common Voice Data (Unknown Hours)
        # MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/whisper-large-v3-dv-syn-md"  # Trained on 100% Synthetic Data (150 Hours)
        # MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/whisper-large-v3-cv"  # Trained on Common Voice Data (Unknown Hours)
        # MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/whisper-large-v3-calls-md"  # Trained on phone calls (65 Hours)
        # MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/wav2vec2-large-mms-1b-calls-md"  # Trained on phone calls (65 Hours)
        # MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/wav2vec2-large-xlsr-calls-md"  # Trained on phone calls (23 Hours)
        # MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/wav2vec2-large-xlsr-dv-syn-md"  # Trained on 100% Synthetic Data (80 Hours)
        # MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/dhivehi-asr-full-ctc"  # Trained on multiple datasets (350+ Hours)
        # MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/dhivehi-asr-full-ctc-v2"  # Trained on multiple datasets (350+ Hours)
        # MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/dhivehi-asr-full-whisper-v3"  # Trained on multiple datasets (350+ Hours)

        # Load model and processor with LM
        processor = Wav2Vec2ProcessorWithLM.from_pretrained(MODEL_NAME)
        model = Wav2Vec2ForCTC.from_pretrained(
            MODEL_NAME,
            torch_dtype=torch_dtype
        ).to(device)

        MAX_LENGTH = 120  # 2 minutes (seconds)
        MIN_LENGTH = 1    # 1 second

        demo.launch()
    else:
        print("Failed to install some requirements")