"""Speech-to-text demo: transcribe an uploaded audio clip with Whisper large-v2."""

import gradio as gr
import numpy as np
from transformers import WhisperProcessor, WhisperForConditionalGeneration

MODEL_NAME = "openai/whisper-large-v2"
# Whisper models are trained on (and require) 16 kHz mono audio.
TARGET_SAMPLE_RATE = 16_000

# Load model and processor once at startup.
processor = WhisperProcessor.from_pretrained(MODEL_NAME)
model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME)
# Clear any forced context tokens so the model auto-detects the language.
model.config.forced_decoder_ids = None


def _to_mono_float(samples: np.ndarray) -> np.ndarray:
    """Convert PCM samples to mono float32 in [-1.0, 1.0].

    Gradio delivers int16 PCM by default; Whisper's feature extractor
    expects a normalized float waveform.
    """
    if np.issubdtype(samples.dtype, np.integer):
        samples = samples.astype(np.float32) / np.iinfo(samples.dtype).max
    else:
        samples = samples.astype(np.float32)
    if samples.ndim > 1:
        # Average channels to mono.
        samples = samples.mean(axis=1)
    return samples


def _resample(samples: np.ndarray, src_rate: int, dst_rate: int) -> np.ndarray:
    """Resample via linear interpolation; adequate for speech transcription."""
    if src_rate == dst_rate:
        return samples
    duration = samples.shape[0] / src_rate
    n_out = int(round(duration * dst_rate))
    src_t = np.linspace(0.0, duration, num=samples.shape[0], endpoint=False)
    dst_t = np.linspace(0.0, duration, num=n_out, endpoint=False)
    return np.interp(dst_t, src_t, samples).astype(np.float32)


def transcribe_audio(audio) -> str:
    """Transcribe an uploaded audio clip to text with Whisper.

    Parameters
    ----------
    audio : tuple[int, np.ndarray] | None
        ``(sample_rate, samples)`` as delivered by ``gr.Audio(type="numpy")``,
        or ``None`` when no audio was provided.

    Returns
    -------
    str
        The transcription (empty string when no audio was supplied).
    """
    if audio is None:
        return ""
    sample_rate, raw_samples = audio

    # Whisper expects a mono float waveform at exactly 16 kHz.
    waveform = _resample(_to_mono_float(raw_samples), sample_rate, TARGET_SAMPLE_RATE)

    # Get audio features; sampling_rate must be stated explicitly or the
    # feature extractor cannot validate the input.
    input_features = processor(
        waveform, sampling_rate=TARGET_SAMPLE_RATE, return_tensors="pt"
    ).input_features

    predicted_ids = model.generate(input_features)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    return transcription[0]


# Modern Gradio (3.x+) component API: gr.inputs/gr.outputs were removed.
audio_input = gr.Audio(type="numpy", label="Upload an audio file")
text_output = gr.Textbox(label="Transcription")

iface = gr.Interface(
    fn=transcribe_audio,
    inputs=audio_input,
    outputs=text_output,
    title="Speech-to-Text using Whisper v2",
    description="Upload an audio file to transcribe it to text.",
    theme="monochrome",
    # NOTE(review): `live=True` and `capture_session=True` were dropped —
    # capture_session no longer exists in Gradio 3.x+, and live re-running a
    # large-v2 transcription on every input event is not desirable.
)

if __name__ == "__main__":
    iface.launch()