Spaces:
Runtime error
Runtime error
File size: 2,943 Bytes
dad2a9b bdc7600 dad2a9b d09d492 bdc7600 d09d492 bdc7600 dad4b00 bdc7600 d09d492 bdc7600 dad4b00 bdc7600 dad4b00 bdc7600 d09d492 bdc7600 dad4b00 bdc7600 d09d492 bdc7600 dad4b00 bdc7600 d09d492 bdc7600 dad4b00 bdc7600 dad4b00 bdc7600 dad4b00 bdc7600 d09d492 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 |
import gradio as gr
import os
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import numpy as np
import librosa
# Initialize Whisper model
# Load the Whisper "base" checkpoint once at module import time so every
# transcription request reuses the same processor/model pair.
processor = WhisperProcessor.from_pretrained("openai/whisper-base")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")
# Set light green theme
# Emerald primary/secondary hues give the UI its light-green look;
# gray neutrals keep body text readable.
theme = gr.themes.Base(
    primary_hue="emerald",
    secondary_hue="emerald",
    neutral_hue="gray",
)
def validate_file(file_path):
    """Check that an uploaded file exists, is under 25 MB, and is .mp3/.wav.

    Returns a ``(is_valid, message)`` tuple; ``message`` explains the
    failure or confirms validity.
    """
    # Guard: missing path or a path that does not exist on disk.
    if not (file_path and os.path.exists(file_path)):
        return False, "No file uploaded or file not found."
    # Guard: enforce the 25 MB upload limit.
    size_mb = os.path.getsize(file_path) / (1024 * 1024)
    if size_mb > 25:
        return False, f"File size is {size_mb:.2f} MB. Please upload a file smaller than 25 MB."
    # Only the mp3/wav containers are accepted (extension check only).
    extension = os.path.splitext(file_path)[1].lower()
    if extension in ('.mp3', '.wav'):
        return True, "File is valid."
    return False, "Only .mp3 and .wav formats are supported."
def transcribe_audio(audio_file):
    """Transcribe an uploaded audio file with the Whisper base model.

    Returns the transcription text on success, or a human-readable
    error message string on any failure.
    """
    # Gradio passes None when no file was provided.
    if audio_file is None:
        return "Please upload an audio file."
    # Reject missing / oversized / wrong-format files up front.
    is_valid, message = validate_file(audio_file)
    if not is_valid:
        return message
    try:
        # Decode and resample to 16 kHz, the rate the Whisper processor expects.
        waveform, _sr = librosa.load(audio_file, sr=16000)
        # Convert raw audio into the model's log-mel input features.
        features = processor(waveform, sampling_rate=16000, return_tensors="pt").input_features
        # Generate token ids, then decode them back into plain text.
        token_ids = model.generate(features)
        return processor.batch_decode(token_ids, skip_special_tokens=True)[0]
    except Exception as e:
        # Surface decode/model errors to the UI instead of crashing the app.
        return f"An error occurred during transcription: {str(e)}"
# Create Gradio interface
with gr.Blocks(theme=theme) as demo:
    gr.Markdown("# Audio Transcription with Whisper")
    gr.Markdown("Upload an audio file (.mp3 or .wav) of maximum 25MB to get the transcription.")
    with gr.Row():
        with gr.Column():
            # Fixed: Use sources parameter instead of type
            # Upload-only audio input; the value handed to transcribe_audio
            # is validated there (existence, size, extension).
            audio_input = gr.Audio(sources=["upload"], label="Upload Audio File")
            submit_btn = gr.Button("Transcribe", variant="primary")
        with gr.Column():
            output = gr.Textbox(label="Transcription Result", lines=10)
    # Wire the button: audio file in -> transcription text out.
    submit_btn.click(fn=transcribe_audio, inputs=audio_input, outputs=output)
    gr.Markdown("### Limitations")
    gr.Markdown("- Maximum file size: 25 MB")
    gr.Markdown("- Supported formats: .mp3 and .wav")
    gr.Markdown("- Uses the Whisper base model which works best with clear audio")
# Launch the app
# Fix: removed the stray trailing "|" after demo.launch(), which made the
# line a syntax error and prevented the module from importing at all.
if __name__ == "__main__":
    demo.launch()