File size: 2,539 Bytes
9ba2a1c
a67942c
16ebd09
6bb833b
16ebd09
5a76806
16ebd09
 
9ba2a1c
16ebd09
 
9ba2a1c
16ebd09
 
 
 
 
 
 
 
 
 
 
 
 
a67942c
9b015dc
 
 
51499e8
6bb833b
16ebd09
 
6bb833b
16ebd09
 
6bb833b
16ebd09
674036d
16ebd09
 
 
 
674036d
3fb3e5a
674036d
 
 
 
3fb3e5a
 
 
 
 
9ba2a1c
5a76806
 
 
 
 
9b015dc
9ba2a1c
5a76806
 
16ebd09
9ba2a1c
16ebd09
3fb3e5a
 
9ba2a1c
 
 
9b015dc
6bb833b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import torch
import gradio as gr
import ffmpeg
import numpy as np
import whisper

# Whisper checkpoint to load; "large-v3" is the highest-accuracy released model.
MODEL_NAME = "large-v3"
# Whisper models expect 16 kHz mono input.
SAMPLE_RATE = 16000

# Prefer GPU when available; model load happens once at import time.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = whisper.load_model(MODEL_NAME).to(device)

def load_audio(file):
    """Decode an audio file into a mono float32 waveform at SAMPLE_RATE.

    Spawns an ffmpeg subprocess to decode, down-mix to one channel, and
    resample as necessary. Requires both the ffmpeg CLI and the
    `ffmpeg-python` package.

    Args:
        file: Path to the audio file to decode.

    Returns:
        1-D numpy float32 array with samples normalized to [-1.0, 1.0).

    Raises:
        RuntimeError: If ffmpeg fails to decode the input.
    """
    try:
        pipeline = ffmpeg.input(file, threads=0).output(
            "-", format="s16le", acodec="pcm_s16le", ac=1, ar=SAMPLE_RATE
        )
        raw, _ = pipeline.run(
            cmd=["ffmpeg", "-nostdin"],
            capture_stdout=True,
            capture_stderr=True,
        )
    except ffmpeg.Error as e:
        raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e

    samples = np.frombuffer(raw, np.int16).flatten()
    # Scale 16-bit signed PCM into the unit range expected by Whisper.
    return samples.astype(np.float32) / 32768.0

def transcribe(audio_file, task):
    """Transcribe (or translate to English) an uploaded audio file.

    Args:
        audio_file: The value from the Gradio File component — either a
            filepath string (newer Gradio) or a tempfile wrapper exposing
            `.name` (older Gradio).
        task: Whisper task name, "transcribe" or "translate".

    Returns:
        One "[HH:MM:SS.ss -> HH:MM:SS.ss] text" line per segment.

    Raises:
        gr.Error: If no file was submitted or processing fails.
    """
    if audio_file is None:
        raise gr.Error("No audio file submitted! Please upload an audio file before submitting your request.")

    try:
        # Newer Gradio passes a plain filepath string; older versions pass
        # an object with a .name attribute. Accept both.
        path = audio_file if isinstance(audio_file, str) else audio_file.name
        audio = load_audio(path)

        # Pinning language="en" unconditionally would make "translate" a
        # no-op (it already targets English), so only pin the source
        # language when transcribing; let Whisper auto-detect otherwise.
        language = "en" if task == "transcribe" else None
        result = model.transcribe(audio, task=task, language=language)

        # Build one timestamped line per segment; join instead of
        # quadratic string concatenation.
        lines = [
            f"[{format_timestamp(seg['start'])} -> {format_timestamp(seg['end'])}] {seg['text']}\n"
            for seg in result["segments"]
        ]
        return "".join(lines)

    except gr.Error:
        # Don't re-wrap errors we raised deliberately.
        raise
    except Exception as e:
        raise gr.Error(f"Error processing audio file: {str(e)}") from e

def format_timestamp(seconds):
    """Format a duration in seconds as a zero-padded "HH:MM:SS.ss" string.

    Args:
        seconds: Non-negative duration in seconds (int or float).

    Returns:
        Timestamp string, e.g. 65.5 -> "00:01:05.50".
    """
    minutes, seconds = divmod(seconds, 60)
    hours, minutes = divmod(minutes, 60)
    # 05.2f zero-pads single-digit seconds ("05.00"); the previous ".2f"
    # produced inconsistent widths like "00:00:5.00".
    return f"{int(hours):02d}:{int(minutes):02d}:{seconds:05.2f}"

# Gradio UI wiring: file upload + task selector in, timestamped text out.
audio_input = gr.components.File(label="Audio file", file_types=["audio"])
# Gradio 3.x renamed the component's `default` kwarg to `value`;
# passing `default=` raises a TypeError on current versions.
task_input = gr.components.Radio(["transcribe", "translate"], label="Task", value="transcribe")
output = gr.components.Textbox(label="Transcription with Timestamps")

demo = gr.Interface(
    fn=transcribe,
    inputs=[audio_input, task_input],
    outputs=output,
    title=f"Whisper {MODEL_NAME}: Transcribe Audio with Timestamps",
    description=(
        f"Transcribe audio files with Whisper {MODEL_NAME}. "
        "Upload an audio file and choose whether to transcribe or translate. "
        "The output includes timestamps for each transcribed segment."
    ),
)

if __name__ == "__main__":
    # Start the local Gradio server only when run as a script.
    demo.launch()