Spaces:
Runtime error
Runtime error
File size: 4,730 Bytes
dad4b00 dad2a9b dad4b00 dad2a9b dad4b00 043212a dad4b00 dad2a9b dad4b00 ee74fc7 dad4b00 dad2a9b dad4b00 dad2a9b dad4b00 dad2a9b dad4b00 1545980 39e0470 dad4b00 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 |
import html
import os
import tempfile
import time
from urllib.parse import parse_qs, urlparse

import gradio as gr
import torch
import yt_dlp as youtube_dl
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from transformers.pipelines.audio_utils import ffmpeg_read
# --- Configuration -----------------------------------------------------------
MODEL_NAME = "openai/whisper-large-v3"  # checkpoint id passed to from_pretrained
BATCH_SIZE = 8  # not referenced anywhere in the visible code
FILE_LIMIT_MB = 25  # largest accepted audio file, in megabytes
YT_LENGTH_LIMIT_S = 3600  # longest accepted YouTube video, in seconds (1 hour)

# --- Device selection --------------------------------------------------------
# CUDA device index 0 when a GPU is visible to torch, otherwise the CPU.
if torch.cuda.is_available():
    device = 0
else:
    device = "cpu"

# --- Model loading -----------------------------------------------------------
# Both objects are created once, at import time, and shared by every request.
processor = WhisperProcessor.from_pretrained(MODEL_NAME)
model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME)
model = model.to(device)
def transcribe_audio(inputs):
    """Transcribe audio with the Whisper model.

    Args:
        inputs: Path to an audio file, the raw bytes of an audio file, or
            ``None`` when the user submitted nothing.

    Returns:
        The decoded transcription text of the first (only) sequence.

    Raises:
        gr.Error: If nothing was submitted, the audio is larger than
            ``FILE_LIMIT_MB``, or the audio cannot be decoded.
    """
    if inputs is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")

    size_limit_bytes = FILE_LIMIT_MB * 1024 * 1024

    if isinstance(inputs, (bytes, bytearray)):
        # Raw payload: the size check is done on the in-memory buffer.
        payload = bytes(inputs)
        if len(payload) > size_limit_bytes:
            raise gr.Error(f"File size exceeds {FILE_LIMIT_MB}MB limit.")
    else:
        # Check the size on disk first so an oversized file is never read.
        if os.path.getsize(inputs) > size_limit_bytes:
            raise gr.Error(f"File size exceeds {FILE_LIMIT_MB}MB limit.")
        with open(inputs, "rb") as audio_file:
            payload = audio_file.read()

    # The processor needs a decoded waveform, not a path: decode and resample
    # to 16 kHz with ffmpeg before feature extraction.
    sampling_rate = 16000
    try:
        waveform = ffmpeg_read(payload, sampling_rate)
    except ValueError as err:
        raise gr.Error(f"Could not decode audio: {err}") from err

    # NOTE(review): the default Whisper feature extractor pads/truncates to a
    # single 30 s window, so longer audio is presumably cut off here - confirm
    # whether long-form (chunked) decoding is required.
    features = processor(waveform, sampling_rate=sampling_rate, return_tensors="pt")

    # Whisper exposes its model inputs as `input_features` (log-mel
    # spectrogram); there is no `input_values` field on the processor output.
    input_features = features.input_features.to(device)

    predicted_ids = model.generate(input_features, max_length=448)

    # Only one audio clip was submitted, so take the first decoded sequence.
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    return transcription
def _return_yt_html_embed(yt_url):
    """Return an HTML snippet embedding the YouTube video referenced by a URL.

    The video id is taken from the ``v`` query parameter when present,
    otherwise from the path of ``youtu.be/<id>`` links or of
    ``/embed/<id>``, ``/shorts/<id>``, ``/live/<id>`` and ``/v/<id>`` links.
    When none of these match, everything after the last ``"?v="`` is used.
    The id is HTML-escaped before being interpolated, because the URL is
    user-supplied and the result is rendered as HTML.

    Args:
        yt_url: The YouTube URL as entered by the user.

    Returns:
        A centered ``<iframe>`` element pointing at the YouTube embed player.
    """
    url = yt_url.strip()
    if "://" not in url:
        # Without a scheme urlparse() treats the host as part of the path.
        url = "https://" + url

    video_id = None
    try:
        parsed = urlparse(url)
        host = (parsed.hostname or "").lower()
        segments = [segment for segment in parsed.path.split("/") if segment]
        query_ids = parse_qs(parsed.query).get("v")
    except ValueError:
        # Malformed URL (e.g. bad IPv6 literal): use the plain-split fallback.
        pass
    else:
        if query_ids:
            video_id = query_ids[0]
        elif host == "youtu.be" or host.endswith(".youtu.be"):
            video_id = segments[0] if segments else None
        elif len(segments) >= 2 and segments[0] in ("embed", "shorts", "live", "v"):
            video_id = segments[1]

    if video_id is None:
        video_id = yt_url.split("?v=")[-1]

    safe_id = html.escape(video_id, quote=True)
    html_embed = f'<center><iframe width="500" height="320" src="https://www.youtube.com/embed/{safe_id}"></iframe></center>'
    return html_embed
def _format_hms(total_seconds):
    """Format a number of seconds as ``HHH:MMM:SSS`` without wrapping at 24 hours."""
    hours, remainder = divmod(int(total_seconds), 3600)
    minutes, seconds = divmod(remainder, 60)
    return f"{hours:02d}H:{minutes:02d}M:{seconds:02d}S"


def download_yt_audio(yt_url, filename):
    """Download a YouTube video to ``filename`` after checking its length.

    Args:
        yt_url: URL of the video to download.
        filename: Output path, handed to yt-dlp as its ``outtmpl`` option.

    Raises:
        gr.Error: If the video metadata cannot be fetched, the video is
            longer than ``YT_LENGTH_LIMIT_S`` seconds, or the download fails.
    """
    # Fetch metadata only, so the length can be checked before downloading.
    try:
        with youtube_dl.YoutubeDL() as info_loader:
            info = info_loader.extract_info(yt_url, download=False)
    except youtube_dl.utils.DownloadError as err:
        raise gr.Error(f"Download error: {str(err)}") from err

    # Check video length. The duration may be absent or None, in which case
    # it is treated as 0, matching the original default.
    file_length_s = int((info or {}).get("duration") or 0)
    if file_length_s > YT_LENGTH_LIMIT_S:
        yt_length_limit_hms = _format_hms(YT_LENGTH_LIMIT_S)
        file_length_hms = _format_hms(file_length_s)
        raise gr.Error(f"Maximum YouTube video length is {yt_length_limit_hms}, but video is {file_length_hms}.")

    # Download the video
    ydl_opts = {"outtmpl": filename, "format": "worstvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best"}
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        try:
            ydl.download([yt_url])
        # DownloadError is the type the metadata step above already handles;
        # catch it here too so both steps surface failures as gr.Error.
        except (youtube_dl.utils.DownloadError, youtube_dl.utils.ExtractorError) as err:
            raise gr.Error(f"Error while downloading video: {str(err)}") from err
def yt_transcribe(yt_url):
    """Download a YouTube video and transcribe it with the Whisper model.

    Args:
        yt_url: URL of the YouTube video.

    Returns:
        A ``(html_embed, transcription)`` tuple: the embed HTML for the video
        and the transcription text.

    Raises:
        gr.Error: If no URL was submitted, or if downloading or transcribing
            the video fails.
    """
    if not yt_url or not yt_url.strip():
        raise gr.Error("No YouTube URL submitted! Please paste a YouTube URL before submitting your request.")

    html_embed = _return_yt_html_embed(yt_url)

    with tempfile.TemporaryDirectory() as tmpdirname:
        filepath = os.path.join(tmpdirname, "video.mp4")
        download_yt_audio(yt_url, filepath)
        # transcribe_audio() works on a file path (it calls os.path.getsize),
        # so pass the path rather than the file's bytes, and do it while the
        # temporary directory still exists.
        transcription = transcribe_audio(filepath)

    return html_embed, transcription
# Create Gradio interface
# NOTE(review): gr.inputs.*, `optional=`, `layout=`, `theme="huggingface"` and
# `launch(enable_queue=...)` look like older-Gradio API - confirm they are
# accepted by the Gradio version this app is pinned to before upgrading.
demo = gr.Blocks()

# Microphone transcription interface
mf_transcribe = gr.Interface(
    fn=transcribe_audio,
    inputs=[
        gr.inputs.Audio(source="microphone", type="filepath", optional=True),
    ],
    outputs="text",
    layout="horizontal",
    theme="huggingface",
    title="Whisper Transcription (Microphone)",
    # The limit is interpolated so the text cannot drift from FILE_LIMIT_MB.
    description=f"Transcribe audio from your microphone. File size limit is {FILE_LIMIT_MB}MB.",
)

# File upload transcription interface
file_transcribe = gr.Interface(
    fn=transcribe_audio,
    inputs=[
        gr.inputs.Audio(source="upload", type="filepath", optional=True, label="Audio file"),
    ],
    outputs="text",
    layout="horizontal",
    theme="huggingface",
    title="Whisper Transcription (File)",
    description=f"Upload an audio file to transcribe. File size limit is {FILE_LIMIT_MB}MB.",
)

# YouTube video transcription interface
# The name `yt_transcribe` is rebound from the function to the Interface here;
# `fn=` is evaluated first, so the Interface still wraps the function.
yt_transcribe = gr.Interface(
    fn=yt_transcribe,
    inputs=[
        gr.inputs.Textbox(lines=1, placeholder="Paste YouTube URL", label="YouTube URL"),
    ],
    outputs=["html", "text"],
    layout="horizontal",
    theme="huggingface",
    title="Free Transcript Maker",
    # This tab takes a URL, not an uploaded file, and offers no download.
    description=(
        f"Paste the URL of a YouTube video (up to {YT_LENGTH_LIMIT_S // 60} minutes long) "
        "to get its transcription. Please use responsibly."
    ),
)

# Group the three interfaces as tabs inside the Blocks container.
with demo:
    gr.TabbedInterface([mf_transcribe, file_transcribe, yt_transcribe], ["Microphone", "Audio file", "YouTube"])

demo.launch(enable_queue=True)
|