# Gradio demo: transcribe/translate audio with OpenAI Whisper, with per-segment timestamps.
import torch
import gradio as gr
import ffmpeg
import numpy as np
import whisper
# Whisper checkpoint to load; "large-v3" is the most accurate (and heaviest) released model.
MODEL_NAME = "large-v3"
# Whisper models expect 16 kHz mono input (see load_audio below).
SAMPLE_RATE = 16000
# Prefer GPU when available; fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Loaded once at import time — downloads the checkpoint on first run.
model = whisper.load_model(MODEL_NAME).to(device)
def load_audio(file):
    """Decode an audio file into a mono float32 waveform at SAMPLE_RATE.

    Spawns the ffmpeg CLI (through the ffmpeg-python bindings) in a
    subprocess to decode, down-mix to one channel, and resample to
    16-bit signed PCM, then normalizes samples into [-1.0, 1.0).

    Requires both the ffmpeg CLI and the `ffmpeg-python` package.

    Raises:
        RuntimeError: if ffmpeg fails to decode the file.
    """
    # Build the decode graph first; only .run() actually launches ffmpeg.
    stream = ffmpeg.input(file, threads=0).output(
        "-", format="s16le", acodec="pcm_s16le", ac=1, ar=SAMPLE_RATE
    )
    try:
        pcm_bytes, _ = stream.run(
            cmd=["ffmpeg", "-nostdin"], capture_stdout=True, capture_stderr=True
        )
    except ffmpeg.Error as e:
        raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
    # int16 PCM -> float32 in [-1, 1): divide by 2**15.
    samples = np.frombuffer(pcm_bytes, np.int16).flatten()
    return samples.astype(np.float32) / 32768.0
def transcribe(audio_file, task):
    """Transcribe or translate an uploaded audio file with Whisper.

    Parameters
    ----------
    audio_file : object
        Upload from ``gr.components.File``; assumed to expose a ``.name``
        filesystem path (true for Gradio 3's tempfile wrapper) —
        NOTE(review): verify against the installed Gradio version.
    task : str
        "transcribe" or "translate" (Whisper's two supported tasks).

    Returns
    -------
    str
        One line per segment: ``[HH:MM:SS.ss -> HH:MM:SS.ss] text``.

    Raises
    ------
    gr.Error
        If no file was submitted or processing fails.
    """
    if audio_file is None:
        raise gr.Error("No audio file submitted! Please upload an audio file before submitting your request.")
    try:
        audio = load_audio(audio_file.name)
        # BUG FIX: the source language was hard-coded to "en", which made
        # task="translate" a no-op (Whisper translates *into* English) and
        # broke transcription of non-English audio. language=None lets
        # Whisper auto-detect the spoken language.
        result = model.transcribe(audio, task=task, language=None)
        # One timestamped line per segment; join once instead of
        # repeated string concatenation.
        lines = [
            f"[{format_timestamp(seg['start'])} -> {format_timestamp(seg['end'])}] {seg['text']}\n"
            for seg in result["segments"]
        ]
        return "".join(lines)
    except Exception as e:
        # Chain the cause so the original traceback is preserved.
        raise gr.Error(f"Error processing audio file: {str(e)}") from e
def format_timestamp(seconds):
    """Format a duration in seconds as a zero-padded ``HH:MM:SS.ss`` string.

    Parameters
    ----------
    seconds : float
        Non-negative duration (Whisper segment start/end time).

    Returns
    -------
    str
        ``"HH:MM:SS.ss"`` with every field zero-padded to fixed width.
    """
    minutes, seconds = divmod(seconds, 60)
    hours, minutes = divmod(minutes, 60)
    # BUG FIX: the seconds field used {seconds:.2f}, which does not
    # zero-pad ("00:00:5.00" for 5 s). {seconds:05.2f} pads to a fixed
    # width of 5 ("05.00") so timestamps are uniform and sortable.
    return f"{int(hours):02d}:{int(minutes):02d}:{seconds:05.2f}"
# --- Gradio UI wiring ------------------------------------------------------
audio_input = gr.components.File(label="Audio file", file_types=["audio"])
# BUG FIX: gr.components.Radio takes its initial selection via `value=`;
# the `default=` keyword was removed in Gradio 3 (which this file targets —
# `file_types=` above is a Gradio 3+ parameter) and raises TypeError.
task_input = gr.components.Radio(
    ["transcribe", "translate"], label="Task", value="transcribe"
)
output = gr.components.Textbox(label="Transcription with Timestamps")

demo = gr.Interface(
    fn=transcribe,
    inputs=[audio_input, task_input],
    outputs=output,
    title=f"Whisper {MODEL_NAME}: Transcribe Audio with Timestamps",
    description=(
        f"Transcribe audio files with Whisper {MODEL_NAME}. "
        "Upload an audio file and choose whether to transcribe or translate. "
        "The output includes timestamps for each transcribed segment."
    ),
)

if __name__ == "__main__":
    demo.launch()