File size: 4,428 Bytes
6c226f9
 
f6a264d
6c226f9
f6a264d
 
d790c0b
f6a264d
 
6c226f9
7097513
9d6fa91
66efbc3
d790c0b
6c226f9
 
 
 
 
 
f6a264d
6c226f9
 
 
7097513
6c226f9
 
 
 
 
 
 
 
f6a264d
6c226f9
f6a264d
6c226f9
f6a264d
 
 
 
 
 
6c226f9
f6a264d
6c226f9
f6a264d
 
6c226f9
f6a264d
6c226f9
f6a264d
 
d790c0b
 
f6a264d
d790c0b
 
f6a264d
 
d790c0b
f6a264d
 
 
6c226f9
f6a264d
 
66efbc3
d790c0b
 
 
f6a264d
d790c0b
 
6c226f9
f6a264d
 
0a7fcda
d790c0b
6c226f9
 
 
 
 
 
 
 
 
 
609dcbe
6c226f9
 
 
 
b95b5ca
6c226f9
b95b5ca
6c226f9
 
 
 
 
 
 
 
7097513
 
609dcbe
7097513
6c226f9
 
 
b95b5ca
6c226f9
b95b5ca
 
6c226f9
 
 
 
 
 
 
 
f6a264d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import torch
import gradio as gr
import yt_dlp as yt
from transformers import pipeline
#from transformers.pipelines.audio_utils import ffmpeg_read
from typing import Tuple
import tempfile
import os
from yt_dlp import YoutubeDL

# --- Demo configuration ------------------------------------------------------
# Hugging Face Hub checkpoint loaded into the speech-recognition pipeline.
MODEL_NAME = "openai/whisper-large-v2"
# Batch size passed to every pipeline call.
BATCH_SIZE = 8
# Size cap (in MB) checked by `transcribe` before an audio file is read.
FILE_LIMIT_MB = 1000
# NOTE(review): defined but not referenced anywhere in the visible code.
YT_LENGTH_LIMIT_S = 3600  # limit to 1 hour YouTube files

# Use CUDA device 0 when a GPU is available, otherwise run on the CPU.
if torch.cuda.is_available():
    device = 0
else:
    device = "cpu"

# Shared ASR pipeline, created once at import time; audio is processed in
# 30-second chunks.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    device=device,
    chunk_length_s=30,
)

def transcribe(microphone, file_upload, task):
    """Transcribe or translate a microphone recording or an uploaded file.

    Args:
        microphone: Filesystem path of the microphone recording, or ``None``
            when the microphone was not used.
        file_upload: Filesystem path of the uploaded audio file, or ``None``
            when nothing was uploaded.
        task: Value forwarded to generation as ``generate_kwargs["task"]``
            (the UI offers ``"transcribe"`` and ``"translate"``).

    Returns:
        The text produced by the pipeline, prefixed with a warning line when
        both a recording and an upload were supplied.

    Raises:
        gr.Error: If neither input was provided, or if the file that would be
            processed is larger than ``FILE_LIMIT_MB``.
    """
    if microphone is None and file_upload is None:
        # gr.Error is the exception Gradio surfaces to the user in the UI.
        raise gr.Error("You have to either use the microphone or upload an audio file")

    warn_output = ""
    if microphone is not None and file_upload is not None:
        warn_output = (
            "WARNING: You've uploaded an audio file and used the microphone. "
            "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
        )

    # The microphone recording takes precedence over the upload.
    file_path = microphone if microphone is not None else file_upload

    # Enforce the limit on the file that is actually read into memory below,
    # not on an upload that is about to be discarded.
    file_size_mb = os.stat(file_path).st_size / (1024 * 1024)
    if file_size_mb > FILE_LIMIT_MB:
        raise gr.Error(
            f"File size exceeds file size limit. Got file of size {file_size_mb:.2f}MB for a limit of {FILE_LIMIT_MB}MB."
        )

    # The pipeline is given the raw encoded bytes of the audio file.
    with open(file_path, "rb") as f:
        inputs = f.read()

    text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task})["text"]

    return warn_output + text
    
def download_yt_audio(yt_url, filename):
    """Download the media behind ``yt_url`` to ``filename`` with yt-dlp.

    Args:
        yt_url: URL of the video to download.
        filename: Output path, passed to yt-dlp as its ``outtmpl`` option.

    Raises:
        gr.Error: If yt-dlp reports a download or extraction failure; the
            yt-dlp message is used as the error text.
    """
    ydl_opts = {
        "outtmpl": filename,
        # Format selector: smallest mp4 video stream merged with the best m4a
        # audio, falling back to the best mp4, then to the best format overall.
        "format": "worstvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
    }
    with yt.YoutubeDL(ydl_opts) as ydl:
        try:
            ydl.download([yt_url])
        except (yt.utils.DownloadError, yt.utils.ExtractorError) as err:
            # Catch both: yt-dlp usually reports failures as DownloadError.
            # Re-raise as gr.Error so the message is shown in the Gradio UI.
            raise gr.Error(str(err)) from err

def yt_transcribe(yt_url, task, max_filesize=75.0) -> Tuple[str, str]:
    """Download a YouTube video and transcribe or translate its audio.

    Args:
        yt_url: URL of the video, as entered in the UI textbox.
        task: Value forwarded to generation as ``generate_kwargs["task"]``
            (the UI offers ``"transcribe"`` and ``"translate"``).
        max_filesize: Currently unused; kept so existing callers that pass it
            keep working.

    Returns:
        A ``(html, text)`` tuple: an HTML snippet embedding the video and the
        text produced by the pipeline.

    Raises:
        gr.Error: If the URL is empty, or if yt-dlp cannot resolve or download
            the video.
    """
    if not yt_url or not yt_url.strip():
        raise gr.Error("Please provide the URL of a YouTube video")
    yt_url = yt_url.strip()

    # Metadata-only lookup (no download) to obtain the video id for the embed.
    try:
        with YoutubeDL({}) as ydl:
            info_dict = ydl.extract_info(yt_url, download=False)
    except (yt.utils.DownloadError, yt.utils.ExtractorError) as err:
        # Surface yt-dlp's message in the UI instead of an unhandled traceback.
        raise gr.Error(str(err)) from err

    video_id = info_dict["id"]
    html_embed_str = f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe> </center>'

    # Read the download into memory inside the `with` block: the temporary
    # directory and its contents are removed as soon as the block exits.
    with tempfile.TemporaryDirectory() as tmpdirname:
        filepath = os.path.join(tmpdirname, "video.mp4")
        download_yt_audio(yt_url, filepath)

        with open(filepath, "rb") as f:
            inputs = f.read()

    text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task})["text"]

    return html_embed_str, text

# Top-level container that hosts both tabs.
demo = gr.Blocks()

# --- Tab 1: microphone recording / uploaded file ------------------------------
# Both audio inputs are optional and hand a file path to `transcribe`.
_audio_tab_inputs = [
    gr.inputs.Audio(source="microphone", type="filepath", optional=True),
    gr.inputs.Audio(source="upload", type="filepath", optional=True),
    gr.inputs.Radio(["transcribe", "translate"], label="Task", default="transcribe"),
]
_audio_tab_description = (
    "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the"
    f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
    " of arbitrary length."
)

mf_transcribe = gr.Interface(
    fn=transcribe,
    title="Whisper Large V2: Transcribe Audio",
    description=_audio_tab_description,
    inputs=_audio_tab_inputs,
    outputs="text",
    layout="horizontal",
    theme="huggingface",
    allow_flagging="never",
)

# --- Tab 2: YouTube URL -------------------------------------------------------
_yt_tab_inputs = [
    gr.inputs.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
    gr.inputs.Radio(["transcribe", "translate"], label="Task", default="transcribe"),
]
_yt_tab_description = (
    "Transcribe long-form YouTube videos with the click of a button! Demo uses the checkpoint"
    f" [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe video files of"
    " arbitrary length."
)

# NOTE: this rebinds the module-level name `yt_transcribe` from the function to
# the Interface. `fn=` is evaluated before the assignment, so the Interface
# still wraps the original function.
yt_transcribe = gr.Interface(
    fn=yt_transcribe,
    title="Whisper Large V2: Transcribe YouTube",
    description=_yt_tab_description,
    inputs=_yt_tab_inputs,
    outputs=["html", "text"],
    layout="horizontal",
    theme="huggingface",
    allow_flagging="never",
)

# Mount the two interfaces as tabs inside the Blocks container.
with demo:
    gr.TabbedInterface(
        [mf_transcribe, yt_transcribe],
        ["Transcribe Audio", "Transcribe YouTube"],
    )

# Start the app with request queueing enabled.
demo.launch(enable_queue=True)