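"""Gradio app for a Hugging Face Space (running on ZeroGPU).

Transcribes YouTube videos with openai/whisper-large-v3 loaded in 4-bit,
caching results in the dwb2023/yt-transcripts-v3 dataset on the Hub so that
repeat requests for the same URL are served without re-transcribing.
"""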
import os
import time
import tempfile
from datetime import datetime

import pandas as pd
import gradio as gr
import yt_dlp as youtube_dl
import torch  # required: bnb_4bit_compute_dtype below uses torch.bfloat16
from transformers import (
    BitsAndBytesConfig,
    AutoModelForSpeechSeq2Seq,
    AutoTokenizer,
    AutoFeatureExtractor,
    pipeline,
)
from transformers.pipelines.audio_utils import ffmpeg_read
from datasets import load_dataset, Dataset, DatasetDict
import spaces
# Constants
MODEL_NAME = "openai/whisper-large-v3"
BATCH_SIZE = 8
YT_LENGTH_LIMIT_S = 4800 # 1 hour 20 minutes
DATASET_NAME = "dwb2023/yt-transcripts-v3"
# Environment setup
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
# Model setup
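# 4-bit NF4 quantization (with double quantization) via bitsandbytes keeps the
# large-v3 checkpoint small enough for a modest GPU; matmuls run in bfloat16.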
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    use_cache=False,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
feature_extractor = AutoFeatureExtractor.from_pretrained(MODEL_NAME)
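# chunk_length_s=30 enables the pipeline's chunked long-form algorithm: audio
# is split into 30-second windows (Whisper's receptive field), transcribed in
# batches, and stitched back together.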
pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=tokenizer,
    feature_extractor=feature_extractor,
    chunk_length_s=30,
)
def reset_and_update_dataset(new_data):
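    """Replace the Hub dataset with a fresh one containing only `new_data`.

    Utility for wiping dwb2023/yt-transcripts-v3 back to a single-row
    "train" split; not called from the Gradio app itself.
    """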
    # Define the schema for an empty DataFrame
    schema = {
        "url": pd.Series(dtype="str"),
        "transcription": pd.Series(dtype="str"),
        "title": pd.Series(dtype="str"),
        "duration": pd.Series(dtype="int"),
        "uploader": pd.Series(dtype="str"),
        "upload_date": pd.Series(dtype="datetime64[ns]"),
        "description": pd.Series(dtype="str"),
        "datetime": pd.Series(dtype="datetime64[ns]"),
    }
    # Create an empty DataFrame with the defined schema
    df = pd.DataFrame(schema)
    # Append the new data
    df = pd.concat([df, pd.DataFrame([new_data])], ignore_index=True)
    # Convert back to a Dataset and push to the Hub
    updated_dataset = Dataset.from_pandas(df)
    dataset_dict = DatasetDict({"train": updated_dataset})
    dataset_dict.push_to_hub(DATASET_NAME)
    print("Dataset reset and updated successfully!")
def download_yt_audio(yt_url, filename):
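    """Download the audio track of `yt_url` to `filename` with yt-dlp,
    rejecting videos longer than YT_LENGTH_LIMIT_S. Returns the video's
    metadata dict (title, duration, uploader, ...)."""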
    info_loader = youtube_dl.YoutubeDL()
    try:
        info = info_loader.extract_info(yt_url, download=False)
    except youtube_dl.utils.DownloadError as err:
        raise gr.Error(str(err))
    file_length = info["duration"]
    if file_length > YT_LENGTH_LIMIT_S:
        yt_length_limit_hms = time.strftime("%H:%M:%S", time.gmtime(YT_LENGTH_LIMIT_S))
        file_length_hms = time.strftime("%H:%M:%S", time.gmtime(file_length))
        raise gr.Error(
            f"Maximum YouTube video length is {yt_length_limit_hms}, but this video is {file_length_hms}."
        )
    ydl_opts = {"outtmpl": filename, "format": "bestaudio/best"}
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        ydl.download([yt_url])
    return info
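# spaces.GPU requests a ZeroGPU device for each call, held for at most 120 s.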
@spaces.GPU(duration=120)
def yt_transcribe(yt_url, task):
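    """Transcribe (or translate) a YouTube video and return the transcript.

    Checks the Hub dataset first and returns a cached transcription when the
    URL has been processed before; otherwise downloads the audio, runs the
    Whisper pipeline, and saves the result back to the dataset.
    """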
    # Return a cached transcription if this URL was already processed
    dataset = load_dataset(DATASET_NAME, split="train")
    for row in dataset:
        if row["url"] == yt_url:
            return row["transcription"]

    # Otherwise download the audio and transcribe it
    with tempfile.TemporaryDirectory() as tmpdirname:
        filepath = os.path.join(tmpdirname, "video.mp4")
        info = download_yt_audio(yt_url, filepath)
        with open(filepath, "rb") as f:
            video_data = f.read()

    inputs = ffmpeg_read(video_data, pipe.feature_extractor.sampling_rate)
    inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
    text = pipe(
        inputs,
        batch_size=BATCH_SIZE,
        generate_kwargs={"task": task},
        return_timestamps=True,
    )["text"]

    # dict.get already supplies defaults, so no try/except is needed here
    title = info.get("title", "N/A")
    duration = info.get("duration", 0)
    uploader = info.get("uploader", "N/A")
    upload_date = info.get("upload_date", "N/A")
    description = info.get("description", "N/A")

    save_transcription(yt_url, text, title, duration, uploader, upload_date, description)
    return text
def save_transcription(yt_url, transcription, title, duration, uploader, upload_date, description):
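    """Append one transcription record to the Hub dataset and push it."""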
    data = {
        "url": yt_url,
        "transcription": transcription,
        "title": title,
        "duration": duration,
        "uploader": uploader,
        "upload_date": upload_date,
        "description": description,
        "datetime": datetime.now().isoformat(),
    }
    # Load the existing dataset and append the new record via pandas
    dataset = load_dataset(DATASET_NAME, split="train")
    df = dataset.to_pandas()
    df = pd.concat([df, pd.DataFrame([data])], ignore_index=True)
    # Convert back to a Dataset and push to the Hub
    updated_dataset = Dataset.from_pandas(df)
    dataset_dict = DatasetDict({"train": updated_dataset})
    dataset_dict.push_to_hub(DATASET_NAME)
demo = gr.Blocks()
yt_transcribe_interface = gr.Interface(
    fn=yt_transcribe,
    inputs=[
        gr.Textbox(
            lines=1,
            placeholder="Paste the URL to a YouTube video here",
            label="YouTube URL",
        ),
        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
    ],
    outputs="text",
    title="Whisper Large V3: Transcribe YouTube",
    description=(
        "Transcribe long-form YouTube videos with the click of a button! The demo uses the checkpoint"
        f" [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe videos"
        " up to 1 hour 20 minutes long, caching results so repeat URLs return instantly."
    ),
    allow_flagging="never",
)
with demo:
    gr.TabbedInterface([yt_transcribe_interface], ["YouTube"])
demo.queue().launch()