import spaces
import gradio as gr
import subprocess
import sys
import os
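# NOTE: torch, torchaudio, numpy and the transformers model/processor are imported
# and created in the __main__ block below, only after install_requirements()
# succeeds; transcribe() relies on them being module-level globals by the time
# Gradio invokes it.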

@spaces.GPU
def transcribe(audio_file):
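    """Transcribe a Dhivehi audio file to text with the wav2vec2 + LM model."""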
    try:
        # Gradio passes None when no audio has been recorded or uploaded
        if audio_file is None:
            return "No audio file provided."

        # Load audio file
        waveform, sample_rate = torchaudio.load(audio_file)
        
        # Move waveform to the correct device
        waveform = waveform.to(device)

        # Get the duration of the audio
        duration = waveform.shape[1] / sample_rate

        # Reject audio outside the supported duration range
        if duration < MIN_LENGTH or duration > MAX_LENGTH:
            return f"Audio must be between {MIN_LENGTH} and {MAX_LENGTH} seconds (got {duration:.1f} s)."

        # Resample if necessary
        if sample_rate != 16000:
            resampler = torchaudio.transforms.Resample(sample_rate, 16000).to(device)
            waveform = resampler(waveform)
        
        # Convert to mono if stereo
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)
        
        # Move to CPU for numpy conversion
        waveform = waveform.cpu()
        audio_input = waveform.squeeze().numpy()
        
        # Ensure audio input is float32
        if audio_input.dtype != np.float32:
            audio_input = audio_input.astype(np.float32)
        
        # Process audio input
        input_values = processor(
            audio_input, 
            sampling_rate=16_000, 
            return_tensors="pt"
        ).input_values.to(device)
        
        # Convert to float16 if using CUDA
        if torch_dtype == torch.float16:
            input_values = input_values.half()

        # Generate transcription
        with torch.no_grad():
            logits = model(input_values).logits

        # Decode with the beam-search + n-gram language model decoder
        # (cast logits to float32 for the CPU decoder in case the model ran in float16)
        transcription = processor.decode(logits[0].cpu().float().numpy())
        
        # Wav2Vec2ProcessorWithLM.decode returns an output object whose .text
        # field holds the decoded string; return it in lowercase
        print(transcription.text)
        return transcription.text.lower()
    
    except Exception as e:
        return f"Error during transcription: {str(e)}"

# Create Gradio interface
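# The transcription output uses Thaana fonts (MV_Faseyha / Faruma / A_Faruma);
# Dhivehi is written right-to-left, hence rtl=True on the output textbox.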

css = """
.textbox1 textarea {
    font-size: 18px !important;
    font-family: 'MV_Faseyha', 'Faruma', 'A_Faruma' !important;
    line-height: 1.8 !important;
}
.textbox2 textarea {
    display: none;
}
"""

demo = gr.Blocks(css=css)

tab_audio = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(sources=["upload","microphone"], type="filepath", label="Audio"),
    ],
    outputs=gr.Textbox(label="Transcription", rtl=True, elem_classes="textbox1"),
    title="Transcribe Dhivehi Audio",
    allow_flagging="never",
)

with demo:
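    # Rendering the Interface through TabbedInterface inside this Blocks lets
    # the custom CSS above apply to its components.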
    gr.TabbedInterface([tab_audio], ["Audio"])


def install_requirements():
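    """Install the packages listed in requirements.txt; return True on success."""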
    requirements_path = 'requirements.txt'
    
    # Check if requirements.txt exists
    if not os.path.exists(requirements_path):
        print("Error: requirements.txt not found")
        return False
        
    try:
        print("Installing requirements...")
        # Using --no-cache-dir to avoid memory issues
        subprocess.check_call([
            sys.executable, 
            "-m", 
            "pip", 
            "install", 
            "-r", 
            requirements_path,
            "--no-cache-dir"
        ])
        print("Successfully installed all requirements")
        return True
    except subprocess.CalledProcessError as e:
        print(f"Error installing requirements: {e}")
        return False
    except Exception as e:
        print(f"Unexpected error: {e}")
        return False

# Launch the interface
if __name__ == "__main__":
    success = install_requirements()
    if success:
        print("All requirements installed successfully")
      
        from transformers import Wav2Vec2ForCTC, Wav2Vec2ProcessorWithLM
        import torch
        import torchaudio
        import numpy as np

        # Device and dtype configuration
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

        MODEL_NAME = "alakxender/wav2vec2-large-mms-1b-dv-syn-md" # Trained on 100% Synthetic Data (150 Hours)
        # MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/wav2vec2-large-mms-1b-cv" # Trained on Common Voice Data (Unknown Hours)
        # MODEL_NAME =  "/home/rusputin/lab/audio/fine-tunes/whisper-small-dv-syn-md" # Trained on 100% Synthetic Data (150 Hours)   
        # MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/whisper-small-cv" # Trained on Common Voice Data (Unknown Hours)
        # MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/whisper-medium-dv-syn-md" # Trained on 100% Synthetic Data (150 Hours)
        # MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/whisper-medium-cv" # Trained on Common Voice Data (Unknown Hours)
        # MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/whisper-large-v3-dv-syn-md" # Trained on 100% Synthetic Data (150 Hours)
        # MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/whisper-large-v3-cv" # Trained on Common Voice Data (Unknown Hours)
        # MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/whisper-large-v3-calls-md" # Trained on phone calls (65 Hours)
        # MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/wav2vec2-large-mms-1b-calls-md" # Trained on phone calls (65 Hours)
        # MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/wav2vec2-large-xlsr-calls-md" # Trained on phone calls (23 Hours)
        # MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/wav2vec2-large-xlsr-dv-syn-md" # Trained on 100% Synthetic Data (80 Hours)
        # MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/dhivehi-asr-full-ctc" # Trained on multiple datasets (350+ Hours)
        # MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/dhivehi-asr-full-ctc-v2" # Trained on multiple datasets (350+ Hours)
        # MODEL_NAME = "/home/rusputin/lab/audio/fine-tunes/dhivehi-asr-full-whisper-v3" # Trained on multiple datasets (350+ Hours)

        # Load model and processor with LM
        processor = Wav2Vec2ProcessorWithLM.from_pretrained(MODEL_NAME)
        model = Wav2Vec2ForCTC.from_pretrained(
            MODEL_NAME,
            torch_dtype=torch_dtype
        ).to(device)

        MAX_LENGTH = 120 # 2 minutes
        MIN_LENGTH = 1 # 1 second

        demo.launch()
    else:
        print("Failed to install some requirements")