File size: 3,781 Bytes
6c226f9
8e787d3
6c226f9
d790c0b
7bbd83c
d790c0b
88183ad
7bbd83c
 
 
 
6c226f9
a5bfe25
9d6fa91
7bbd83c
6c226f9
 
7bbd83c
6c226f9
7bbd83c
 
 
6c226f9
7bbd83c
 
 
 
 
 
6c226f9
7bbd83c
3c0cd8e
 
 
7bbd83c
6c226f9
d790c0b
 
 
 
 
 
 
7bbd83c
d790c0b
 
 
 
7bbd83c
d790c0b
 
 
 
7bbd83c
d790c0b
7bbd83c
66efbc3
7bbd83c
d790c0b
 
 
 
 
b97a3c2
 
3c0cd8e
7bbd83c
 
6c226f9
7bbd83c
 
 
 
 
6c226f9
 
 
7bbd83c
6c226f9
7097513
3ce82e9
 
7097513
7bbd83c
a5bfe25
6c226f9
b95b5ca
 
6c226f9
 
 
 
 
 
7bbd83c
6c226f9
ab14d7d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import json
import os
import tempfile
import time
from datetime import datetime
from pathlib import Path
from uuid import uuid4

import gradio as gr
import torch
import yt_dlp as youtube_dl
from huggingface_hub import CommitScheduler
from transformers import pipeline
from transformers.pipelines.audio_utils import ffmpeg_read

# Checkpoint identifier handed to the transformers pipeline below.
MODEL_NAME = "openai/whisper-large-v3"
# Batch size forwarded to the pipeline on every transcription call.
BATCH_SIZE = 8
YT_LENGTH_LIMIT_S = 4800  # maximum accepted video length in seconds (4800 s = 80 minutes)

# Use CUDA device index 0 when torch reports a GPU, otherwise fall back to the CPU.
device = 0 if torch.cuda.is_available() else "cpu"
# Shared speech-recognition pipeline; created once at import time and reused by every request.
pipe = pipeline(task="automatic-speech-recognition", model=MODEL_NAME, chunk_length_s=30, device=device)

# Local folder that collects transcription records. The file name embeds a fresh
# uuid4, so each start of this script appends to its own file.
JSON_DATASET_DIR = Path("json_dataset")
JSON_DATASET_DIR.mkdir(parents=True, exist_ok=True)
JSON_DATASET_PATH = JSON_DATASET_DIR / f"transcriptions-{uuid4()}.json"

# Scheduler that syncs JSON_DATASET_DIR to the "data" folder of a Hub dataset repo.
# Its `lock` is held by save_transcription() while appending to the file.
# NOTE(review): "your-dataset-repo" looks like a placeholder — confirm the real repo id.
scheduler = CommitScheduler(
    repo_id="your-dataset-repo",
    repo_type="dataset",
    folder_path=JSON_DATASET_DIR,
    path_in_repo="data",
)

def transcribe_audio(inputs, task):
    """Run the shared Whisper pipeline on an uploaded or recorded audio input.

    Args:
        inputs: Audio input accepted by the pipeline; ``None`` means nothing
            was submitted.
        task: Generation task forwarded to the model (the UI offers
            ``"transcribe"`` and ``"translate"``).

    Returns:
        The ``"text"`` field of the pipeline output.

    Raises:
        gr.Error: If ``inputs`` is ``None``.
    """
    if inputs is not None:
        result = pipe(
            inputs,
            batch_size=BATCH_SIZE,
            generate_kwargs={"task": task},
            return_timestamps=True,
        )
        return result["text"]

    raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")

def download_yt_audio(yt_url, filename):
    """Download the best available audio stream of a YouTube video to *filename*.

    The video's metadata is fetched first, without downloading any media, so
    that videos longer than ``YT_LENGTH_LIMIT_S`` seconds are rejected up front.

    Args:
        yt_url: URL of the YouTube video.
        filename: Output path, passed to yt-dlp as its ``outtmpl`` option.

    Raises:
        gr.Error: If yt-dlp cannot extract the metadata, if the metadata
            carries no duration, or if the video exceeds the length limit.
    """
    # Context manager so the metadata-only YoutubeDL instance is closed again.
    with youtube_dl.YoutubeDL() as info_loader:
        try:
            info = info_loader.extract_info(yt_url, download=False)
        except youtube_dl.utils.DownloadError as err:
            raise gr.Error(str(err)) from err

    # The duration can be absent (e.g. the URL is not a single finished video);
    # report that to the user instead of failing with a KeyError.
    file_length = info.get("duration_string")
    if not file_length:
        raise gr.Error("Could not determine the length of this YouTube video, so it cannot be transcribed.")

    # Colon-separated fields, least significant last: weight them from the
    # right by powers of 60. This works for any field count, so no zero
    # padding to a fixed H:M:S shape is needed.
    fields = [int(part) for part in file_length.split(":")]
    file_length_s = sum(value * 60 ** power for power, value in enumerate(reversed(fields)))

    if file_length_s > YT_LENGTH_LIMIT_S:
        yt_length_limit_hms = time.strftime("%HH:%MM:%SS", time.gmtime(YT_LENGTH_LIMIT_S))
        file_length_hms = time.strftime("%HH:%MM:%SS", time.gmtime(file_length_s))
        raise gr.Error(f"Maximum YouTube length is {yt_length_limit_hms}, got {file_length_hms} YouTube video.")

    ydl_opts = {"outtmpl": filename, "format": "bestaudio/best"}
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        ydl.download([yt_url])

def yt_transcribe(yt_url, task):
    """Download a YouTube video's audio, transcribe it, and record the result.

    Args:
        yt_url: URL of the YouTube video.
        task: Generation task forwarded to the model (the UI offers
            ``"transcribe"`` and ``"translate"``).

    Returns:
        The ``"text"`` field of the pipeline output.
    """
    # The download only needs to live long enough to be read into memory.
    with tempfile.TemporaryDirectory() as workdir:
        audio_path = os.path.join(workdir, "video.mp4")
        download_yt_audio(yt_url, audio_path)
        raw_bytes = Path(audio_path).read_bytes()

    # Decode at the sampling rate the model's feature extractor expects.
    waveform = ffmpeg_read(raw_bytes, pipe.feature_extractor.sampling_rate)
    model_input = {
        "array": waveform,
        "sampling_rate": pipe.feature_extractor.sampling_rate,
    }

    result = pipe(
        model_input,
        batch_size=BATCH_SIZE,
        generate_kwargs={"task": task},
        return_timestamps=True,
    )
    transcript = result["text"]

    save_transcription(yt_url, transcript)
    return transcript

def save_transcription(yt_url, transcription):
    """Append one transcription record as a JSON line to the dataset file.

    The scheduler's lock is held for the whole write.

    Args:
        yt_url: URL of the transcribed video, stored under ``"url"``.
        transcription: Transcribed text, stored under ``"transcription"``.
    """
    with scheduler.lock, JSON_DATASET_PATH.open("a") as sink:
        record = {
            "url": yt_url,
            "transcription": transcription,
            "datetime": datetime.now().isoformat(),
        }
        json.dump(record, sink)
        sink.write("\n")

# Top-level container for the app; the tabbed layout is attached to it below.
demo = gr.Blocks()

# Form-style interface around yt_transcribe(): a URL textbox and a task selector
# in, plain text out. The two inputs map positionally to (yt_url, task).
yt_transcribe_interface = gr.Interface(
    fn=yt_transcribe,
    inputs=[
        gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe")
    ],
    outputs="text",
    title="Whisper Large V3: Transcribe YouTube",
    description=(
        "Transcribe long-form YouTube videos with the click of a button! Demo uses the checkpoint"
        f" [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe video files of"
        " arbitrary length."
    ),
    allow_flagging="never",
)

# Only the YouTube interface is exposed as a tab; transcribe_audio() is defined
# in this file but is not wired into the UI here.
with demo:
    gr.TabbedInterface([yt_transcribe_interface], ["YouTube"])

# Enable request queuing and start the server when the script runs.
demo.queue().launch()