import gradio as gr
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

# Choose the device and dtype based on CUDA availability
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Load the model and processor
model_id = "openai/whisper-large-v3"
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, use_safetensors=True
)
model.to(device)

processor = AutoProcessor.from_pretrained(model_id)

# Build an ASR pipeline that chunks long audio and returns timestamps
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    batch_size=16,
    return_timestamps=True,
    torch_dtype=torch_dtype,
    device=device,
)

# Define a function to transcribe audio
def transcribe_audio(audio_file):
    # Guard against an empty upload before running the pipeline
    if audio_file is None:
        raise gr.Error("Please upload an audio file before submitting.")
    # Transcribe, forcing English as the output language
    result = pipe(audio_file, generate_kwargs={"language": "english"})
    return result["text"]

# Define the Gradio input/output components
audio_input = gr.Audio(label="Upload Audio", type="filepath")
output_text = gr.Textbox(label="Transcribed Text")

# Instantiate the Gradio interface
app = gr.Interface(
    fn=transcribe_audio,
    inputs=audio_input,
    outputs=output_text,
    title="Audio Transcription with Whisper Model",
    description="Upload an audio file to transcribe it into text using the Whisper model.",
    theme="compact",
)

# Launch the Gradio interface
app.launch(debug=True, inline=False)
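
# A minimal sanity check, as a sketch: transcribe_audio() can be exercised
# without the UI by calling it on a local file. "sample.wav" is a placeholder
# path (an assumption, not a file provided by this script), and the call is
# left commented out because app.launch() above blocks until the server
# is closed.
#
# print(transcribe_audio("sample.wav"))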