# Gradio demo: upload an audio file and transcribe it to text with Whisper large-v2.
import gradio as gr
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# The processor and the model are both loaded from the same checkpoint
# identifier, so it is spelled out once here.
_WHISPER_CHECKPOINT = "openai/whisper-large-v2"

processor = WhisperProcessor.from_pretrained(_WHISPER_CHECKPOINT)
model = WhisperForConditionalGeneration.from_pretrained(_WHISPER_CHECKPOINT)

def transcribe_audio(audio_file) -> str:
    """Transcribe an uploaded audio file to text with the Whisper model.

    Args:
        audio_file: A binary file-like object exposing ``read()``, a
            filesystem path given as ``str``, or ``None`` when no audio was
            supplied.

    Returns:
        The decoded text of the first item in the generated batch, or an
        empty string when ``audio_file`` is ``None``.

    Raises:
        ValueError: May be raised by ``ffmpeg_read`` when the ffmpeg binary
            is unavailable or the payload cannot be decoded as audio.
    """
    # Nothing to transcribe when no audio was supplied.
    if audio_file is None:
        return ""

    # Local import keeps this block self-contained; ``transformers`` is
    # already a dependency of this file.
    from transformers.pipelines.audio_utils import ffmpeg_read

    if isinstance(audio_file, str):
        with open(audio_file, "rb") as stream:
            encoded_audio = stream.read()
    else:
        encoded_audio = audio_file.read()

    # The processor works on a decoded waveform, not on the encoded file
    # bytes, so decode (and resample) to the rate the feature extractor is
    # configured for before extracting features.
    sampling_rate = processor.feature_extractor.sampling_rate
    waveform = ffmpeg_read(encoded_audio, sampling_rate)

    input_features = processor(
        waveform,
        sampling_rate=sampling_rate,
        return_tensors="pt",
    ).input_features

    # Transcribe without forcing any context tokens so that the model tries
    # to automatically detect the language.
    model.config.forced_decoder_ids = None
    predicted_ids = model.generate(input_features)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)

    return transcription[0]

# Widgets: an audio upload on the input side, a text box for the result.
audio_input = gr.inputs.Audio(label="Upload an audio file", type="file")
text_output = gr.outputs.Textbox(label="Transcription")

# Interface settings collected in one mapping and expanded into the
# constructor call below.
_interface_options = {
    "fn": transcribe_audio,
    "inputs": audio_input,
    "outputs": text_output,
    "title": "Speech-to-Text using Whisper v2",
    "description": "Upload an audio file to transcribe it to text.",
    "theme": "Monochrome",
    "live": True,
    "capture_session": True,
}
iface = gr.Interface(**_interface_options)

# Start serving the interface.
iface.launch()