# sudoping01's picture
# Update app.py
# c41db98 verified
import gradio as gr
import os
import torch
from transformers import (
WhisperForConditionalGeneration,
WhisperTokenizer,
WhisperProcessor,
pipeline
)
# Half precision is only selected when a CUDA device is available;
# otherwise everything stays in float32.
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
model_id = "sudoping01/whosper-large"

# Read the Hub token once so that every from_pretrained call below
# authenticates the same way (None when HF_TOKEN is not set).
hf_token = os.environ.get("HF_TOKEN")

# Load model and create pipeline
model = WhisperForConditionalGeneration.from_pretrained(
    model_id,
    # Load the weights in the same dtype that is handed to the pipeline
    # below, so the model and the pipeline agree on precision.
    torch_dtype=torch_dtype,
    device_map="auto",
    use_cache=True,
    # NOTE(review): these dropout overrides presumably only matter in
    # training mode; confirm they are needed for an inference-only app.
    attention_dropout=0.1,
    dropout=0.1,
    token=hf_token,
)

# NOTE(review): newer transformers releases appear to read generation
# settings from model.generation_config rather than model.config; confirm
# that the attributes below are actually picked up by generate().
model.config.suppress_tokens = []
model.config.no_repeat_ngram_size = 3
model.config.early_stopping = True
model.config.max_length = 448
model.config.num_beams = 5

# The tokenizer and processor come from the same repository as the model,
# so they receive the same token.
tokenizer = WhisperTokenizer.from_pretrained(model_id, token=hf_token)
processor = WhisperProcessor.from_pretrained(model_id, token=hf_token)
feature_extractor = processor.feature_extractor

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=tokenizer,
    feature_extractor=feature_extractor,
    torch_dtype=torch_dtype,
    chunk_length_s=30,
    stride_length_s=3,
    return_timestamps=False,
    batch_size=1,
)
def transcribe(audio):
    """Transcribe one audio input with the module-level ASR pipeline.

    Args:
        audio: The value received from the audio input component, or
            ``None`` when no audio was provided.

    Returns:
        The transcribed text on success; a prompt asking for input when
        ``audio`` is ``None``; or a message starting with
        ``"Error during transcription: "`` when the pipeline call fails.
    """
    import logging

    if audio is None:
        return "Please provide an audio input."

    # Deterministic beam-search decoding: sampling is disabled.
    generate_kwargs = {
        "temperature": 0.0,
        "do_sample": False,
        "num_beams": 5,
        "length_penalty": 1.0,
        "repetition_penalty": 1.2,
    }

    try:
        result = pipe(audio, generate_kwargs=generate_kwargs)
        return result["text"]
    except Exception as e:
        # This is the UI boundary: keep returning a readable message to the
        # user, but record the full traceback so failures can be diagnosed.
        logging.getLogger(__name__).exception("Transcription failed")
        return f"Error during transcription: {str(e)}"
# Build the Gradio UI: one audio input (microphone or file upload, passed to
# the handler as a file path) and one text box for the result.
audio_input = gr.Audio(sources=["microphone", "upload"], type="filepath")
transcription_output = gr.Textbox(label="Transcription")

demo = gr.Interface(
    fn=transcribe,
    inputs=[audio_input],
    outputs=transcription_output,
    title="Multilingual Speech Recognition: Wolof, French, English, .. or Mix",
    description="Upload an audio file or record audio to transcribe Wolof, French, or English speech...",
    theme="default",
)

# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()