Spaces:
Sleeping
Sleeping
File size: 4,348 Bytes
88183ad 7bbd83c 68bab0c 7bbd83c 68bab0c b3ee19f 68bab0c 6c226f9 a11fbef 68bab0c 9d6fa91 554c0b5 6c226f9 68bab0c b3ee19f 68bab0c b3ee19f 68bab0c b3ee19f 68bab0c f4720e3 35c87e9 68bab0c 6c226f9 554c0b5 8dba9f0 7bbd83c 6c226f9 554c0b5 7bbd83c 520f263 7bbd83c 6c226f9 d790c0b 554c0b5 68bab0c 7bbd83c d790c0b 7bbd83c 66efbc3 68bab0c 6b09585 7bbd83c d790c0b b97a3c2 68bab0c 7bbd83c 6c226f9 68bab0c 7bbd83c 68bab0c 7bbd83c 6c226f9 7bbd83c 6c226f9 7097513 68bab0c 7097513 7bbd83c a5bfe25 6c226f9 b95b5ca 6c226f9 68bab0c 6c226f9 ab14d7d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 |
import os
import json
import time
from datetime import datetime
from pathlib import Path
from uuid import uuid4
import tempfile
import gradio as gr
import yt_dlp as youtube_dl
from huggingface_hub import CommitScheduler
from transformers import (
BitsAndBytesConfig,
AutoModelForSpeechSeq2Seq,
AutoTokenizer,
AutoFeatureExtractor,
pipeline,
)
from transformers.pipelines.audio_utils import ffmpeg_read
import torch # If you're using PyTorch
import spaces
# Sets the HF_HUB_ENABLE_HF_TRANSFER flag for the Hugging Face Hub client.
# NOTE(review): huggingface_hub is already imported above this line; if the
# library reads this flag at import time, the assignment comes too late to
# have any effect -- confirm, and move it above the imports if so.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

MODEL_NAME = "openai/whisper-large-v3"
BATCH_SIZE = 8
YT_LENGTH_LIMIT_S = 4800  # 80 minutes (1 hour 20 minutes)

# Quantization settings: 4-bit NF4 weights with double quantization, and
# bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the checkpoint with the quantization settings above; device placement
# is delegated to device_map="auto".
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    use_cache=False,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_NAME)

# Speech-recognition pipeline built from the pre-loaded parts. No explicit
# device is passed here; the model was already placed by from_pretrained.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=tokenizer,
    feature_extractor=feature_extractor,
    chunk_length_s=30,
)

# Local folder that collects transcription records. The UUID in the file name
# is generated when this module runs, so each run writes to its own file.
JSON_DATASET_DIR = Path("json_dataset")
JSON_DATASET_DIR.mkdir(parents=True, exist_ok=True)
JSON_DATASET_PATH = JSON_DATASET_DIR / f"transcriptions-{uuid4()}.json"

# Scheduler that commits the contents of JSON_DATASET_DIR to the "data"
# folder of the dataset repository.
scheduler = CommitScheduler(
    repo_id="transcript-dataset-repo",
    repo_type="dataset",
    folder_path=JSON_DATASET_DIR,
    path_in_repo="data",
)
def download_yt_audio(yt_url, filename):
    """Download the best available audio track of a video to ``filename``.

    The video's metadata is fetched first so that over-long videos are
    rejected before any media is downloaded.

    Args:
        yt_url: URL of the video, passed to yt-dlp unchanged.
        filename: Output path, used as yt-dlp's ``outtmpl`` template.

    Raises:
        gr.Error: If yt-dlp fails to extract the metadata or to download the
            media, if the metadata carries no duration, or if the duration
            exceeds ``YT_LENGTH_LIMIT_S``.
    """
    # Use the context manager so the metadata-only instance is closed too,
    # matching how the downloading instance is handled below.
    with youtube_dl.YoutubeDL() as info_loader:
        try:
            info = info_loader.extract_info(yt_url, download=False)
        except youtube_dl.utils.DownloadError as err:
            raise gr.Error(str(err)) from err

    # The duration may be absent or None; surface that as a user-facing error
    # instead of letting a KeyError/TypeError escape from the comparison.
    file_length = info.get("duration") if info else None
    if file_length is None:
        raise gr.Error(
            "Could not determine the length of this video, so it cannot be transcribed."
        )

    if file_length > YT_LENGTH_LIMIT_S:
        yt_length_limit_hms = time.strftime("%H:%M:%S", time.gmtime(YT_LENGTH_LIMIT_S))
        file_length_hms = time.strftime("%H:%M:%S", time.gmtime(file_length))
        raise gr.Error(
            f"Maximum YouTube length is {yt_length_limit_hms}, got {file_length_hms} YouTube video."
        )

    ydl_opts = {"outtmpl": filename, "format": "bestaudio/best"}
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        try:
            ydl.download([yt_url])
        except youtube_dl.utils.DownloadError as err:
            # Report download failures the same way as metadata failures.
            raise gr.Error(str(err)) from err
@spaces.GPU
def yt_transcribe(yt_url, task):
    """Transcribe (or translate) the audio of a video and record the result.

    Args:
        yt_url: URL of the video to process.
        task: Value forwarded to the pipeline as ``generate_kwargs["task"]``;
            the UI offers "transcribe" and "translate".

    Returns:
        The ``"text"`` field of the pipeline output.
    """
    # The download lives in a throwaway directory; only the raw bytes are
    # kept once the directory is removed.
    with tempfile.TemporaryDirectory() as workdir:
        media_path = os.path.join(workdir, "video.mp4")
        download_yt_audio(yt_url, media_path)
        with open(media_path, "rb") as media_file:
            raw_bytes = media_file.read()

    # Pass the bytes through ffmpeg_read at the feature extractor's sampling
    # rate, then hand array and rate to the pipeline together.
    sampling_rate = pipe.feature_extractor.sampling_rate
    waveform = ffmpeg_read(raw_bytes, sampling_rate)
    result = pipe(
        {"array": waveform, "sampling_rate": sampling_rate},
        batch_size=BATCH_SIZE,
        generate_kwargs={"task": task},
        return_timestamps=True,
    )
    text = result["text"]

    save_transcription(yt_url, text)
    return text
def save_transcription(yt_url, transcription):
    """Append one transcription record to ``JSON_DATASET_PATH`` as a JSON line.

    Args:
        yt_url: URL the transcription was produced from.
        transcription: Transcribed text to store.
    """
    # The scheduler's lock is held for the whole append.
    with scheduler.lock, JSON_DATASET_PATH.open("a") as sink:
        record = {
            "url": yt_url,
            "transcription": transcription,
            "datetime": datetime.now().isoformat(),
        }
        json.dump(record, sink)
        sink.write("\n")
demo = gr.Blocks()

# Text shown under the interface title.
_DESCRIPTION = (
    "Transcribe long-form YouTube videos with the click of a button! Demo uses the checkpoint"
    f" [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe video files of"
    " arbitrary length."
)

# Input widgets, in the order yt_transcribe receives them: URL, then task.
_url_input = gr.Textbox(
    lines=1,
    placeholder="Paste the URL to a YouTube video here",
    label="YouTube URL",
)
_task_input = gr.Radio(["transcribe", "translate"], label="Task", value="transcribe")

yt_transcribe_interface = gr.Interface(
    fn=yt_transcribe,
    inputs=[_url_input, _task_input],
    outputs="text",
    title="Whisper Large V3: Transcribe YouTube",
    description=_DESCRIPTION,
    allow_flagging="never",
)

# Mount the single interface as a tab named "YouTube" inside the Blocks app.
with demo:
    gr.TabbedInterface([yt_transcribe_interface], ["YouTube"])

# Enable the request queue, then start the app.
demo.queue().launch()
|