# NOTE: removed web-scrape artifacts (file-size banner, git commit hashes, and a
# line-number gutter from the original file-viewer page) that preceded the code.
import gradio as gr
import os
import torch
from transformers import (
    WhisperForConditionalGeneration,
    WhisperTokenizer,
    WhisperProcessor,
    pipeline
)

# Prefer fp16 on GPU for lower memory and faster inference; CPU falls back to
# fp32, where half precision is poorly supported.
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
# Fine-tuned Whisper checkpoint on the Hugging Face Hub.
model_id = "sudoping01/whosper-large"

# Load model and create pipeline
model = WhisperForConditionalGeneration.from_pretrained(
    model_id,
    device_map="auto",  # let accelerate place/shard weights across available devices
    use_cache=True,  # cache past key/values to speed up autoregressive decoding
    attention_dropout=0.1,  # NOTE(review): dropout is inert at inference (eval mode) — confirm intent
    dropout=0.1,
    token=os.environ.get("HF_TOKEN")  # HF auth token for gated/private repos; may be None
)

# Decoding settings applied via the model config (used at generation time).
model.config.suppress_tokens = []  # do not suppress any token ids during generation
model.config.no_repeat_ngram_size = 3  # forbid repeating any 3-gram (curbs looping transcripts)
model.config.early_stopping = True  # stop beam search once enough candidates finish
model.config.max_length = 448  # Whisper's maximum decoder sequence length
model.config.num_beams = 5  # beam-search width

tokenizer = WhisperTokenizer.from_pretrained(model_id)
processor = WhisperProcessor.from_pretrained(model_id)
# NOTE(review): alias is never used below — the pipeline reads
# processor.feature_extractor directly; kept for backward compatibility.
feature_extractor = processor.feature_extractor

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,  # NOTE(review): model was loaded without torch_dtype (fp32 weights) — confirm fp16 here is intended
    chunk_length_s=30,  # split long audio into 30 s windows (Whisper's native input length)
    stride_length_s=3,  # 3 s overlap between chunks so words are not cut at boundaries
    return_timestamps=False,
    batch_size=1
)

def transcribe(audio):
    """Transcribe an audio file with the module-level ASR pipeline.

    Always returns a string so the Gradio textbox has something to show:
    the transcription on success, or a readable message when the input is
    missing or the pipeline raises.
    """
    if audio is None:
        return "Please provide an audio input."

    # Deterministic beam-search decoding with mild anti-repetition pressure.
    decoding_options = {
        "temperature": 0.0,
        "do_sample": False,
        "num_beams": 5,
        "length_penalty": 1.0,
        "repetition_penalty": 1.2,
    }

    try:
        output = pipe(audio, generate_kwargs=decoding_options)
        return output["text"]
    except Exception as e:
        # Surface the failure in the UI instead of crashing the app.
        return f"Error during transcription: {str(e)}"

# --- Gradio UI --------------------------------------------------------------
# One audio input (microphone or file upload, passed as a filepath) wired to
# the transcription function, with a plain textbox for the result.
audio_source = gr.Audio(sources=["microphone", "upload"], type="filepath")
transcript_output = gr.Textbox(label="Transcription")

demo = gr.Interface(
    fn=transcribe,
    inputs=[audio_source],
    outputs=transcript_output,
    title="Multilingual Speech Recognition: Wolof, French, English, .. or Mix",
    description="Upload an audio file or record audio to transcribe Wolof, French, or English speech...",
    theme="default",
)

if __name__ == "__main__":  # launch the server only when run as a script
    demo.launch()