"""Speech-to-text demo: transcribe an uploaded audio clip with Whisper large-v2."""

import gradio as gr
import numpy as np
from transformers import WhisperProcessor, WhisperForConditionalGeneration

MODEL_NAME = "openai/whisper-large-v2"
# Whisper models are trained on (and require) 16 kHz mono audio.
TARGET_SAMPLE_RATE = 16_000

# Load model and processor once at startup.
processor = WhisperProcessor.from_pretrained(MODEL_NAME)
model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME)
# Clear any forced context tokens so the model auto-detects the language.
model.config.forced_decoder_ids = None


def _to_mono_float(samples: np.ndarray) -> np.ndarray:
    """Convert PCM samples to mono float32 in [-1.0, 1.0].

    Gradio delivers int16 PCM by default; Whisper's feature extractor
    expects a normalized float waveform.
    """
    if np.issubdtype(samples.dtype, np.integer):
        samples = samples.astype(np.float32) / np.iinfo(samples.dtype).max
    else:
        samples = samples.astype(np.float32)
    if samples.ndim > 1:
        # Average channels to mono.
        samples = samples.mean(axis=1)
    return samples


def _resample(samples: np.ndarray, src_rate: int, dst_rate: int) -> np.ndarray:
    """Resample via linear interpolation; adequate for speech transcription."""
    if src_rate == dst_rate:
        return samples
    duration = samples.shape[0] / src_rate
    n_out = int(round(duration * dst_rate))
    src_t = np.linspace(0.0, duration, num=samples.shape[0], endpoint=False)
    dst_t = np.linspace(0.0, duration, num=n_out, endpoint=False)
    return np.interp(dst_t, src_t, samples).astype(np.float32)


def transcribe_audio(audio) -> str:
    """Transcribe an uploaded audio clip to text with Whisper.

    Parameters
    ----------
    audio : tuple[int, np.ndarray] | None
        ``(sample_rate, samples)`` as delivered by ``gr.Audio(type="numpy")``,
        or ``None`` when no audio was provided.

    Returns
    -------
    str
        The transcription (empty string when no audio was supplied).
    """
    if audio is None:
        return ""
    sample_rate, raw_samples = audio

    # Whisper expects a mono float waveform at exactly 16 kHz.
    waveform = _resample(_to_mono_float(raw_samples), sample_rate, TARGET_SAMPLE_RATE)

    # Get audio features; sampling_rate must be stated explicitly or the
    # feature extractor cannot validate the input.
    input_features = processor(
        waveform, sampling_rate=TARGET_SAMPLE_RATE, return_tensors="pt"
    ).input_features

    predicted_ids = model.generate(input_features)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    return transcription[0]


# Modern Gradio (3.x+) component API: gr.inputs/gr.outputs were removed.
audio_input = gr.Audio(type="numpy", label="Upload an audio file")
text_output = gr.Textbox(label="Transcription")

iface = gr.Interface(
    fn=transcribe_audio,
    inputs=audio_input,
    outputs=text_output,
    title="Speech-to-Text using Whisper v2",
    description="Upload an audio file to transcribe it to text.",
    theme="monochrome",
    # NOTE(review): `live=True` and `capture_session=True` were dropped —
    # capture_session no longer exists in Gradio 3.x+, and live re-running a
    # large-v2 transcription on every input event is not desirable.
)

if __name__ == "__main__":
    iface.launch()