import numpy as np
import torch
import gradio as gr
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Load ASR model & processor
asr_model_id = "Norphel/wav2vec2-large-mms-1b-dzo-colab"
device = "cuda" if torch.cuda.is_available() else "cpu"
asr_model = Wav2Vec2ForCTC.from_pretrained(asr_model_id, target_lang="dzo").to(device)
asr_processor = Wav2Vec2Processor.from_pretrained(asr_model_id)
asr_processor.tokenizer.set_target_lang("dzo")

# Function to process audio & generate text
def generate_text(audio):
    if audio is None:
        return "No audio recorded."

    sr, data = audio  # Gradio's numpy audio type yields (sample_rate, waveform)

    # Gradio records int16 PCM; convert to float32 in [-1, 1] and downmix stereo to mono
    data = np.asarray(data, dtype=np.float32)
    if data.ndim > 1:
        data = data.mean(axis=1)
    if np.abs(data).max() > 1.0:
        data /= 32768.0

    # Resample to the 16 kHz rate the model expects
    if sr != 16000:
        data = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)(
            torch.from_numpy(data)
        ).numpy()

    # Convert to model input format
    input_dict = asr_processor(data, sampling_rate=16000, return_tensors="pt", padding=True)
    
    # Run model inference
    with torch.no_grad():
        logits = asr_model(input_dict.input_values.to(device)).logits
    pred_ids = torch.argmax(logits, dim=-1)[0]

    # Decode prediction
    return asr_processor.decode(pred_ids)
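
# --- Illustrative extra (not part of the original app): transcribe a local file ---
# A minimal sketch that reuses generate_text() outside the UI; the path
# "sample.wav" below is a hypothetical placeholder.
def transcribe_file(path):
    waveform, file_sr = torchaudio.load(path)  # (channels, samples), float32
    mono = waveform.mean(dim=0).numpy()        # downmix to a mono numpy array
    return generate_text((file_sr, mono))

# Example usage: print(transcribe_file("sample.wav"))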

# Gradio interface
demo = gr.Interface(
    fn=generate_text,
    inputs=gr.Audio(sources=["microphone"], type="numpy"),  # record via the browser microphone; yields (sample_rate, np.ndarray)
    outputs="text",
    title="Dzongkha Speech-to-Text",
    description="Record your voice and get transcriptions in Dzongkha."
)

if __name__ == "__main__":
    demo.launch()
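
# Note: for a temporary public link while testing locally, Gradio also supports
# demo.launch(share=True); on Hugging Face Spaces the plain launch() suffices.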