Spaces:
Running
Running
File size: 4,648 Bytes
9ba2a1c a67942c 9ba2a1c bce555f 9ba2a1c a5bf333 9ba2a1c c3e624b 9ba2a1c bce555f 3b5175f 9ba2a1c bce555f 9ba2a1c bce555f 9ba2a1c bce555f 9ba2a1c 57bf973 9ba2a1c a67942c bce555f 51499e8 9334a23 891b8fc a67942c 891b8fc 9334a23 891b8fc 9334a23 891b8fc 9334a23 891b8fc bce555f 9334a23 bce555f 9ba2a1c bce555f 42f6a29 9ba2a1c bce555f 9ba2a1c bce555f 9ba2a1c bce555f 9ba2a1c bce555f 9ba2a1c bce555f 9ba2a1c bce555f 9ba2a1c bce555f 9ba2a1c bce555f 9ba2a1c bce555f 9ba2a1c a5bf333 9ba2a1c 42f6a29 bce555f 9ba2a1c bce555f a5bf333 9ba2a1c 42f6a29 bce555f 9ba2a1c bce555f 9ba2a1c a5bf333 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 |
import torch
import gradio as gr
import yt_dlp as youtube_dl
from transformers import pipeline
from transformers.pipelines.audio_utils import ffmpeg_read
from urllib.parse import urlparse, parse_qs
import tempfile
import time
import os
import numpy as np
# Constants
# Model identifier passed as `model=` to the pipeline constructed below.
MODEL_NAME = "dataprizma/whisper-large-v3-turbo"
# Batch size forwarded to every `pipe(...)` call in this file.
BATCH_SIZE = 8
# NOTE(review): not referenced anywhere in the visible part of this file.
FILE_LIMIT_MB = 1000
# Upper bound (in seconds) on the duration of a YouTube video that
# `download_yt_audio` will accept.
YT_LENGTH_LIMIT_S = 3600 # 1 hour limit
# Device selection: CUDA device index 0 when CUDA is available, else the CPU.
device = 0 if torch.cuda.is_available() else "cpu"
# Load the Whisper pipeline once at import time; both transcription functions
# share this single instance. `chunk_length_s=30` is the chunk length (in
# seconds) handed to the pipeline.
pipe = pipeline(
task="automatic-speech-recognition",
model=MODEL_NAME,
chunk_length_s=30,
device=device,
)
# Extract YouTube Video ID
def _extract_yt_video_id(yt_url):
parsed_url = urlparse(yt_url)
return parse_qs(parsed_url.query).get("v", [""])[0]
# Embed YouTube Video in HTML
def _return_yt_html_embed(yt_url):
    """Build the HTML snippet that embeds the given YouTube video in an iframe.

    Args:
        yt_url: The YouTube URL entered by the user.

    Returns:
        An HTML string containing a centered ``<iframe>`` pointing at the
        video's ``/embed/`` URL.

    Raises:
        gr.Error: If no video ID can be extracted from ``yt_url``.
    """
    # Local import so this block does not depend on a file-level import change.
    from urllib.parse import quote

    video_id = _extract_yt_video_id(yt_url)
    if not video_id:
        raise gr.Error("Invalid YouTube URL. Please check and try again.")

    # The ID comes straight from user input and ends up inside an HTML
    # attribute; percent-encode it so characters such as quotes or angle
    # brackets cannot break out of the ``src`` value. Well-formed IDs
    # (letters, digits, "-" and "_") pass through unchanged.
    safe_video_id = quote(video_id, safe="")
    return f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{safe_video_id}"></iframe> </center>'
# Transcribe an uploaded or recorded audio file
def transcribe(audio_file, task):
    """Run the shared Whisper pipeline on an audio file and return its text.

    Args:
        audio_file: Path of the audio file on disk, or None when the user
            submitted nothing.
        task: Value forwarded to generation as ``generate_kwargs["task"]``.

    Returns:
        The ``"text"`` entry of the pipeline result.

    Raises:
        gr.Error: If ``audio_file`` is None.
    """
    if audio_file is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting.")

    # ffmpeg_read is given the encoded file contents as bytes, not a path.
    with open(audio_file, "rb") as handle:
        encoded_audio = handle.read()

    # Decode at the sampling rate the pipeline's feature extractor reports.
    target_rate = pipe.feature_extractor.sampling_rate
    waveform = ffmpeg_read(encoded_audio, target_rate)

    payload = {"raw": np.array(waveform), "sampling_rate": target_rate}
    transcription = pipe(
        payload,
        batch_size=BATCH_SIZE,
        generate_kwargs={"task": task},
        return_timestamps=True,
    )
    return transcription["text"]
# Download YouTube audio
def download_yt_audio(yt_url, filename):
    """Download the audio track of a YouTube video to ``filename`` as MP3.

    The video's metadata is fetched first so that over-long videos are
    rejected before any download starts.

    Args:
        yt_url: URL of the YouTube video.
        filename: Output path template handed to yt-dlp as ``outtmpl``.

    Raises:
        gr.Error: If the video is longer than ``YT_LENGTH_LIMIT_S`` seconds,
            or if yt-dlp reports a download error.
    """
    ydl_opts = {
        "format": "bestaudio/best",
        "outtmpl": filename,
        "postprocessors": [
            {"key": "FFmpegExtractAudio", "preferredcodec": "mp3", "preferredquality": "192"}
        ],
    }
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        try:
            info = ydl.extract_info(yt_url, download=False)
            # "duration" can be absent *or* present with value None;
            # treat both as 0 so the comparison below never sees None.
            file_length_s = info.get("duration") or 0  # Duration in seconds
            if file_length_s > YT_LENGTH_LIMIT_S:
                # Format from whole seconds so float durations do not render
                # as e.g. "1.0h".
                minutes, seconds = divmod(int(file_length_s), 60)
                hours, minutes = divmod(minutes, 60)
                raise gr.Error(f"Maximum YouTube length is 1 hour. Your video is {hours}h {minutes}m {seconds}s.")
            ydl.download([yt_url])
        except youtube_dl.utils.DownloadError as err:
            # Surface yt-dlp failures in the UI while keeping the original
            # exception as the cause for debugging.
            raise gr.Error(str(err)) from err
# YouTube transcription function
def yt_transcribe(yt_url, task, max_filesize=75.0):
    """Download a YouTube video's audio and transcribe it with the pipeline.

    Args:
        yt_url: URL of the YouTube video.
        task: Value forwarded to generation as ``generate_kwargs["task"]``.
        max_filesize: Maximum size of the downloaded audio file, in MB.

    Returns:
        A ``(html_embed, text)`` tuple: the iframe HTML for the video and the
        ``"text"`` entry of the pipeline result.

    Raises:
        gr.Error: If the URL is invalid, the download fails or is too long
            (raised by the helpers), or the downloaded file exceeds
            ``max_filesize``.
    """
    html_embed_str = _return_yt_html_embed(yt_url)

    with tempfile.TemporaryDirectory() as tmpdirname:
        filepath = os.path.join(tmpdirname, "audio.mp3")
        download_yt_audio(yt_url, filepath)
        if os.path.getsize(filepath) > max_filesize * 1024 * 1024:
            raise gr.Error(f"File too large! Max allowed size is {max_filesize}MB.")
        # Read the bytes while the temporary directory still exists.
        with open(filepath, "rb") as f:
            audio_bytes = f.read()

    sampling_rate = pipe.feature_extractor.sampling_rate
    audio_array = ffmpeg_read(audio_bytes, sampling_rate)

    # The ASR pipeline accepts a dict with the waveform under "raw" (or
    # "array") plus "sampling_rate" at the top level. Nesting it under
    # "input_features" is rejected by the pipeline, so pass the same shape
    # that `transcribe` uses.
    inputs = {"raw": audio_array, "sampling_rate": sampling_rate}

    text = pipe(
        inputs,
        batch_size=BATCH_SIZE,
        # NOTE(review): "forced_decoder_ids": None is kept from the original
        # call; presumably it clears decoder IDs preset in the model's
        # generation config so that `task` takes effect - confirm.
        generate_kwargs={"task": task, "forced_decoder_ids": None},
        return_timestamps=True,
    )["text"]
    return html_embed_str, text
# Gradio UI
# Top-level container that hosts the two tabs defined below.
demo = gr.Blocks()

# Tab 1: transcribe an uploaded or recorded audio file.
file_interface = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(type="filepath", label="Audio file"),
        gr.Radio(["transcribe", "translate"], label="Task"),
    ],
    outputs="text",
    title="Whisper Large V3: Transcribe Audio",
    description="Whisper Large V3 fine-tuned for Uzbek language by Dataprizma",
    flagging_mode="never",
)

# Tab 2: transcribe the audio of a YouTube video.
# The interface gets its own name so it does not rebind (and thereby hide)
# the `yt_transcribe` function it wraps.
yt_interface = gr.Interface(
    fn=yt_transcribe,
    inputs=[
        gr.Textbox(lines=1, placeholder="Paste YouTube URL here", label="YouTube URL"),
        gr.Radio(["transcribe", "translate"], label="Task"),
    ],
    outputs=["html", "text"],
    title="Whisper Large V3: Transcribe YouTube",
    description="Whisper Large V3 fine-tuned for Uzbek language by Dataprizma",
    flagging_mode="never",
)

with demo:
    gr.TabbedInterface([file_interface, yt_interface], ["Audio file", "YouTube"])

demo.launch()
|