File size: 3,839 Bytes
6c226f9
8e787d3
6c226f9
d790c0b
a414c37
7bbd83c
554c0b5
d790c0b
88183ad
7bbd83c
 
 
 
592b794
6c226f9
a11fbef
 
a5bfe25
9d6fa91
554c0b5
6c226f9
 
7bbd83c
6c226f9
554c0b5
8dba9f0
7bbd83c
 
6c226f9
554c0b5
7bbd83c
520f263
7bbd83c
 
 
 
6c226f9
c5ddbf5
592b794
7bbd83c
3c0cd8e
 
 
7bbd83c
6c226f9
d790c0b
 
 
 
 
 
554c0b5
 
 
 
d790c0b
7bbd83c
d790c0b
7bbd83c
66efbc3
c5ddbf5
592b794
7bbd83c
d790c0b
 
 
 
 
b97a3c2
 
3c0cd8e
7bbd83c
 
6c226f9
7bbd83c
 
 
 
 
6c226f9
 
 
7bbd83c
6c226f9
7097513
3ce82e9
 
7097513
7bbd83c
a5bfe25
6c226f9
b95b5ca
 
6c226f9
 
 
 
 
 
7bbd83c
6c226f9
ab14d7d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import gradio as gr
import yt_dlp as youtube_dl
from transformers import pipeline
from transformers.pipelines.audio_utils import ffmpeg_read
import torch
from huggingface_hub import CommitScheduler
import spaces
import tempfile
import os
import json
from datetime import datetime
from pathlib import Path
from uuid import uuid4
from functools import lru_cache

# Opt in to the hf_transfer backend for Hugging Face Hub transfers.
# NOTE(review): this is assigned after `transformers` and `huggingface_hub`
# were imported above - confirm the setting is still honoured at this point.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

# Checkpoint loaded into the ASR pipeline below and linked from the UI description.
MODEL_NAME = "openai/whisper-large-v3"
# Batch size passed to every `pipe(...)` call in this file.
BATCH_SIZE = 8
# Longest YouTube video (in seconds) that download_yt_audio accepts.
YT_LENGTH_LIMIT_S = 4800  # 1 hour 20 minutes

# Use CUDA device 0 when available, otherwise run on the CPU.
device = 0 if torch.cuda.is_available() else "cpu"
# Built once at import time and shared by all requests; 30-second chunking is
# enabled so long audio can be handled.
pipe = pipeline(task="automatic-speech-recognition", model=MODEL_NAME, chunk_length_s=30, device=device)

# Local folder that is mirrored to the Hub by the scheduler below. Each process
# start gets its own uniquely named file (uuid4), to which save_transcription
# appends one JSON object per line.
JSON_DATASET_DIR = Path("json_dataset")
JSON_DATASET_DIR.mkdir(parents=True, exist_ok=True)
JSON_DATASET_PATH = JSON_DATASET_DIR / f"transcriptions-{uuid4()}.json"

# CommitScheduler that uploads JSON_DATASET_DIR to the "data" folder of a
# Hugging Face dataset repo; the commit interval is left at the library default.
# NOTE(review): `repo_id` has no namespace - presumably it resolves against the
# authenticated user; confirm.
scheduler = CommitScheduler(
    repo_id="transcript-dataset-repo",
    repo_type="dataset",
    folder_path=JSON_DATASET_DIR,
    path_in_repo="data",
)

@spaces.GPU(duration=120)
@lru_cache(maxsize=10)
def transcribe_audio(inputs, task):
    """Run the module-level Whisper pipeline on an audio input and return its text.

    Results are memoised by ``lru_cache`` for the 10 most recently used
    ``(inputs, task)`` pairs, so both arguments must be hashable.

    Args:
        inputs: Audio handed unchanged to ``pipe``; ``None`` is rejected.
        task: Value forwarded to generation as ``generate_kwargs["task"]``.

    Returns:
        The ``"text"`` entry of the pipeline output. Timestamps are requested
        from the pipeline but are not part of the return value.

    Raises:
        gr.Error: If ``inputs`` is ``None``.
    """
    if inputs is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")

    generation_options = {"task": task}
    result = pipe(
        inputs,
        batch_size=BATCH_SIZE,
        generate_kwargs=generation_options,
        return_timestamps=True,
    )
    return result["text"]

def download_yt_audio(yt_url, filename):
    """Download the best available audio of a YouTube video to ``filename``.

    The video's metadata is fetched first (without downloading any media) so
    that videos longer than ``YT_LENGTH_LIMIT_S`` seconds are rejected before
    the actual download starts.

    Args:
        yt_url: URL of the video, passed to yt-dlp.
        filename: Output path template given to yt-dlp as ``outtmpl``.

    Raises:
        gr.Error: If yt-dlp fails to extract the metadata or to download the
            media, if the metadata carries no duration, or if the video is
            longer than ``YT_LENGTH_LIMIT_S``.
    """
    # Imported locally because the file's import block does not import `time`;
    # without it the over-length branch below raised NameError instead of the
    # intended gr.Error.
    import time

    try:
        # Context manager so the metadata-only YoutubeDL instance is closed.
        with youtube_dl.YoutubeDL() as info_loader:
            info = info_loader.extract_info(yt_url, download=False)
    except youtube_dl.utils.DownloadError as err:
        raise gr.Error(str(err)) from err

    # Not every extraction result has a duration; report that cleanly instead
    # of failing with KeyError / TypeError on the comparison below.
    file_length = info.get("duration")
    if file_length is None:
        raise gr.Error("Could not determine the length of the YouTube video.")

    if file_length > YT_LENGTH_LIMIT_S:
        yt_length_limit_hms = time.strftime("%H:%M:%S", time.gmtime(YT_LENGTH_LIMIT_S))
        file_length_hms = time.strftime("%H:%M:%S", time.gmtime(file_length))
        raise gr.Error(f"Maximum YouTube length is {yt_length_limit_hms}, got {file_length_hms} YouTube video.")

    ydl_opts = {"outtmpl": filename, "format": "bestaudio/best"}
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        try:
            ydl.download([yt_url])
        except youtube_dl.utils.DownloadError as err:
            # Surface download failures in the UI the same way as
            # metadata-extraction failures above.
            raise gr.Error(str(err)) from err

@spaces.GPU(duration=120)
@lru_cache(maxsize=10)
def yt_transcribe(yt_url, task):
    """Download a YouTube video's audio, transcribe it, and log the result.

    The audio is fetched into a temporary directory, decoded to the sampling
    rate of the pipeline's feature extractor, and run through the module-level
    Whisper pipeline. The resulting text is appended to the local dataset file
    via ``save_transcription``. Results are memoised by ``lru_cache`` for the 10
    most recently used ``(yt_url, task)`` pairs; a cache hit returns the stored
    text without re-running (or re-saving) anything.

    Args:
        yt_url: URL of the YouTube video.
        task: Value forwarded to generation as ``generate_kwargs["task"]``.

    Returns:
        The ``"text"`` entry of the pipeline output.
    """
    with tempfile.TemporaryDirectory() as workdir:
        audio_path = os.path.join(workdir, "video.mp4")
        download_yt_audio(yt_url, audio_path)
        # Read everything into memory before the temporary directory is removed.
        raw_bytes = Path(audio_path).read_bytes()

    sampling_rate = pipe.feature_extractor.sampling_rate
    waveform = ffmpeg_read(raw_bytes, sampling_rate)
    result = pipe(
        {"array": waveform, "sampling_rate": sampling_rate},
        batch_size=BATCH_SIZE,
        generate_kwargs={"task": task},
        return_timestamps=True,
    )
    transcript = result["text"]
    save_transcription(yt_url, transcript)
    return transcript

def save_transcription(yt_url, transcription):
    """Append one transcription record to the local dataset file.

    The record is written as a single JSON object followed by a newline, with
    keys ``url``, ``transcription`` and ``datetime`` (local time, ISO format).
    The write happens while holding ``scheduler.lock``.

    Args:
        yt_url: URL the transcription belongs to.
        transcription: Transcribed text to store.
    """
    with scheduler.lock, JSON_DATASET_PATH.open("a") as sink:
        record = {
            "url": yt_url,
            "transcription": transcription,
            "datetime": datetime.now().isoformat(),
        }
        sink.write(json.dumps(record) + "\n")

# Top-level container for the whole app.
demo = gr.Blocks()

# Interface for the YouTube flow: a URL textbox and a task selector go in, the
# text returned by yt_transcribe comes out.
yt_transcribe_interface = gr.Interface(
    fn=yt_transcribe,
    inputs=[
        gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
        # The selected value is passed to yt_transcribe as `task`.
        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe")
    ],
    outputs="text",
    title="Whisper Large V3: Transcribe YouTube",
    description=(
        "Transcribe long-form YouTube videos with the click of a button! Demo uses the checkpoint"
        f" [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe video files of"
        " arbitrary length."
    ),
    allow_flagging="never",
)

# Only the YouTube tab is registered here; transcribe_audio is not wired to any
# UI component in the code visible in this file.
with demo:
    gr.TabbedInterface([yt_transcribe_interface], ["YouTube"])

# Enable the request queue and start serving the app.
demo.queue().launch()