sudoping01 committed · verified
Commit e8e1ee0 · 1 Parent(s): c061ec6

Create app.py

Files changed (1)
app.py +93 -0
app.py ADDED
@@ -0,0 +1,93 @@
+ import gradio as gr
+ import torch
+ from transformers import (
+     WhisperForConditionalGeneration,
+     WhisperTokenizer,
+     WhisperProcessor,
+     pipeline
+ )
+
+ # Model checkpoint, device, and dtype selection
+ model_id = "sudoping01/whosper-large"
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
+
+ def load_model():
+     """Build the ASR pipeline once at startup."""
+     model = WhisperForConditionalGeneration.from_pretrained(
+         model_id,
+         device_map="auto",
+         use_cache=True,
+         attention_dropout=0.1,  # config overrides; dropout is inactive at inference
+         dropout=0.1
+     )
+
+     # Decoding defaults for transcription
+     model.config.suppress_tokens = []
+     model.config.no_repeat_ngram_size = 3
+     model.config.early_stopping = True
+     model.config.max_length = 448
+     model.config.num_beams = 5
+
+     tokenizer = WhisperTokenizer.from_pretrained(model_id)
+     processor = WhisperProcessor.from_pretrained(model_id)
+
+     return pipeline(
+         "automatic-speech-recognition",
+         model=model,
+         tokenizer=tokenizer,
+         feature_extractor=processor.feature_extractor,
+         torch_dtype=torch_dtype,
+         chunk_length_s=30,
+         stride_length_s=3,
+         return_timestamps=False,
+         batch_size=1
+     )
+
+ pipe = load_model()
+
+ def transcribe(audio_path):
+     """Transcribe an uploaded or recorded audio file."""
+     if audio_path is None:
+         return "Please provide an audio input."
+
+     try:
+         result = pipe(
+             audio_path,
+             generate_kwargs={
+                 "temperature": 0.0,
+                 "do_sample": False,
+                 "num_beams": 5,
+                 "length_penalty": 1.0,
+                 "repetition_penalty": 1.2
+             }
+         )
+         return result["text"]
+     except Exception as e:
+         return f"Error during transcription: {str(e)}"
+
+ # Custom styling for the interface
+ css = """
+ .gradio-container {max-width: 800px !important}
+ .audio-btn {height: 40px; width: 40px;}
+ """
+
+ demo = gr.Interface(
+     fn=transcribe,
+     inputs=[
+         gr.Audio(sources=["microphone", "upload"], type="filepath", label="Audio Input")
+     ],
+     outputs=gr.Textbox(label="Transcription"),
+     title="Whosper Large - Speech Recognition",
+     description="Upload an audio file or record audio to transcribe speech.",
+     article="""
+ This is a demo of the Whosper Large speech recognition model.
+ - Supports multiple languages: Wolof, French, English, and more
+ - Optimized for clarity and accuracy
+ - Real-time transcription capabilities
+ """,
+     css=css
+ )
+
+ demo.queue()
+ demo.launch()
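
For reference, the same checkpoint can be exercised outside the Gradio app with a plain transformers ASR pipeline. The snippet below is a minimal sketch and not part of the commit: it assumes the sudoping01/whosper-large checkpoint is accessible from the hub and that a local audio file exists; "sample.wav" is a placeholder path.

import torch
from transformers import pipeline

device = "cuda" if torch.cuda.is_available() else "cpu"

# Build an ASR pipeline directly from the checkpoint referenced in app.py.
asr = pipeline(
    "automatic-speech-recognition",
    model="sudoping01/whosper-large",
    device=device,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    chunk_length_s=30,
)

# "sample.wav" is an illustrative local file, not something shipped with the Space.
print(asr("sample.wav")["text"])

Called this way, decoding uses the model's default generation settings rather than the app's generate_kwargs (beam search, repetition penalty), so transcripts may differ slightly from the Space output.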