# app.py — Hugging Face Space: Whisper Large V3 speech transcription/translation
# with per-segment timestamps, served through a Gradio interface.
import torch
import gradio as gr
from transformers import pipeline
import numpy as np
from pydub import AudioSegment
import io
# Hugging Face model id of the ASR checkpoint to load.
MODEL_NAME = "openai/whisper-large-v3"
# Number of audio chunks decoded per forward pass through the model.
BATCH_SIZE = 8

# transformers accepts a CUDA device index (int) or the string "cpu".
device = 0 if torch.cuda.is_available() else "cpu"

# Long-form transcription pipeline: input audio is split into 30-second
# chunks and the per-chunk results are stitched back together.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device,
)
def transcribe(audio_file, task):
    """Transcribe or translate an uploaded audio file with Whisper.

    Args:
        audio_file: Path to the uploaded file (any format pydub/ffmpeg can
            decode), or None when nothing was submitted.
        task: "transcribe" or "translate", forwarded to Whisper's decoder.

    Returns:
        One "[HH:MM:SS.ss -> HH:MM:SS.ss] text" line per transcribed chunk,
        each terminated by a newline ("" if the model produced no chunks).

    Raises:
        gr.Error: When no file was submitted or decoding/inference fails.
    """
    if audio_file is None:
        raise gr.Error("No audio file submitted! Please upload an audio file before submitting your request.")
    try:
        audio = AudioSegment.from_file(audio_file)
        # Whisper expects mono audio sampled at 16 kHz.
        if audio.channels > 1:
            audio = audio.set_channels(1)
        audio = audio.set_frame_rate(16000)
        # Normalize by the source's actual full-scale value instead of
        # assuming 16-bit PCM: full scale for signed samples of width w
        # bytes is 2**(8*w - 1).  (The original divided by 32768.0, which
        # is wrong for 8-, 24- or 32-bit input.)
        full_scale = float(1 << (8 * audio.sample_width - 1))
        samples = np.array(audio.get_array_of_samples()).astype(np.float32) / full_scale
        inputs = {"array": samples, "sampling_rate": 16000}
        result = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)
        lines = []
        for chunk in result["chunks"]:
            start_time, end_time = chunk["timestamp"]
            # Whisper occasionally emits None as the final chunk's end
            # timestamp; fall back to the start so formatting never crashes.
            if end_time is None:
                end_time = start_time
            lines.append(f"[{format_timestamp(start_time)} -> {format_timestamp(end_time)}] {chunk['text']}\n")
        return "".join(lines)
    except Exception as e:
        # Surface the failure to the UI, preserving the original traceback.
        raise gr.Error(f"Error processing audio file: {str(e)}") from e
def format_timestamp(seconds):
    """Format a duration in seconds as a zero-padded "HH:MM:SS.ss" string.

    Args:
        seconds: Non-negative duration in seconds (int or float).

    Returns:
        "HH:MM:SS.ss" with every field zero-padded to fixed width.
    """
    minutes, seconds = divmod(seconds, 60)
    hours, minutes = divmod(minutes, 60)
    # 05.2f pads the seconds field to width 5 ("05.25") so timestamps align;
    # the original "{seconds:.2f}" produced ragged output like "00:00:5.50".
    return f"{int(hours):02d}:{int(minutes):02d}:{seconds:05.2f}"
# Build the interface from explicit components.  Note: on Gradio 3+ the
# pre-selected choice of a Radio is set with `value=`; the Gradio 2.x
# keyword `default=` (used originally) raises TypeError on modern Gradio.
audio_input = gr.components.File(label="Audio file", file_types=["audio"])
task_input = gr.components.Radio(["transcribe", "translate"], label="Task", value="transcribe")
output = gr.components.Textbox(label="Transcription with Timestamps")

demo = gr.Interface(
    fn=transcribe,
    inputs=[audio_input, task_input],
    outputs=output,
    # Plain string: the original used an f-string with no placeholders.
    title="Whisper Large V3: Transcribe Audio with Timestamps",
    description=(
        f"Transcribe audio files with Whisper Large V3 [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}). "
        "Upload an audio file and choose whether to transcribe or translate. "
        "The output includes timestamps for each transcribed segment."
    ),
)

if __name__ == "__main__":
    demo.launch()