import os

import gradio as gr
import torch
from transformers import (
    WhisperForConditionalGeneration,
    WhisperProcessor,
    WhisperTokenizer,
    pipeline,
)

# Use half precision on GPU, full precision on CPU.
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
model_id = "sudoping01/whosper-large"

# Load the model. The dropout overrides are config values and have no effect
# at inference time (the pipeline runs the model in eval mode).
model = WhisperForConditionalGeneration.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch_dtype,
    use_cache=True,
    attention_dropout=0.1,
    dropout=0.1,
    token=os.environ.get("HF_TOKEN"),
)

# Decoding defaults; generate_kwargs passed at call time take precedence.
model.config.suppress_tokens = []
model.config.no_repeat_ngram_size = 3
model.config.early_stopping = True
model.config.max_length = 448
model.config.num_beams = 5

# Pass the token here as well, in case the model repository is gated.
tokenizer = WhisperTokenizer.from_pretrained(model_id, token=os.environ.get("HF_TOKEN"))
processor = WhisperProcessor.from_pretrained(model_id, token=os.environ.get("HF_TOKEN"))

# Chunked long-form transcription: 30 s windows with 3 s of overlap.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    chunk_length_s=30,
    stride_length_s=3,
    return_timestamps=False,
    batch_size=1,
)


def transcribe(audio):
    """Transcribe an audio file path; return the text or an error message."""
    if audio is None:
        return "Please provide an audio input."
    try:
        result = pipe(
            audio,
            generate_kwargs={
                # Deterministic beam search; temperature is ignored when
                # do_sample is False but is kept explicit for clarity.
                "temperature": 0.0,
                "do_sample": False,
                "num_beams": 5,
                "length_penalty": 1.0,
                "repetition_penalty": 1.2,
            },
        )
        return result["text"]
    except Exception as e:
        return f"Error during transcription: {e}"


# Create the Gradio interface.
demo = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
    outputs=gr.Textbox(label="Transcription"),
    title="Multilingual Speech Recognition: Wolof, French, English, or a Mix",
    description=(
        "Upload an audio file or record from the microphone to transcribe "
        "Wolof, French, or English speech, including mixed-language audio."
    ),
    theme="default",
)

if __name__ == "__main__":
    demo.launch()
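# --- Optional extras (sketches; not part of the original app) ---
# Quick smoke test without the UI; "sample.wav" is a hypothetical local path:
#
#     print(transcribe("sample.wav"))
#
# For container deployments (e.g. a Hugging Face Space), binding to all
# interfaces on a fixed port is common; server_name and server_port are
# standard gradio launch() parameters, and the port value is an assumption:
#
#     demo.launch(server_name="0.0.0.0", server_port=7860)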