import spaces
import torch
import gradio as gr
import yt_dlp as youtube_dl
import whisperx
import tempfile
import os
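
# VerbaLens demo Space: transcribes microphone, uploaded, or YouTube audio
# with WhisperX (ASR, word-level alignment, pyannote speaker diarization).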

device = "cuda" if torch.cuda.is_available() else "cpu"
BATCH_SIZE = 4
FILE_LIMIT_MB = 1000
COMPUTE_TYPE = "float32" 
YT_LENGTH_LIMIT_S = 3600  # limit to 1 hour YouTube files

model = whisperx.load_model("large-v2", device,compute_type=COMPUTE_TYPE)

@spaces.GPU
def transcribe(inputs, task):
    if inputs is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")

    file_size_mb = os.stat(inputs).st_size / (1024 * 1024)
    if file_size_mb > FILE_LIMIT_MB:
        raise gr.Error(f"File size exceeds the {FILE_LIMIT_MB} MB limit.")

    # Transcribe, then align the segments for word-level timestamps. Forwarding
    # task= ("transcribe" or "translate") assumes a WhisperX version whose
    # transcribe() accepts it.
    audio = whisperx.load_audio(inputs)
    result = model.transcribe(audio, batch_size=BATCH_SIZE, task=task)
    model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
    result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)

    # Diarize and attach a speaker label to each aligned segment. This needs an
    # HF_TOKEN with access to pyannote's gated diarization models.
    diarize_model = whisperx.DiarizationPipeline(use_auth_token=os.getenv("HF_TOKEN"), device=device)
    diarize_segments = diarize_model(audio)
    result = whisperx.assign_word_speakers(diarize_segments, result)

    output_text = ""
    for segment in result["segments"]:
        speaker = segment.get("speaker", "Unknown Speaker")
        text = segment["text"]
        output_text += f"{speaker}: {text}\n"

    return output_text

def _return_yt_html_embed(yt_url):
    # Naive ID extraction; assumes a standard watch URL like https://www.youtube.com/watch?v=<id>
    video_id = yt_url.split("?v=")[-1]
    return f'<center><iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"></iframe></center>'

def download_yt_audio(yt_url, filename):
    ydl_opts = {
        "format": "bestaudio/best",
        "outtmpl": filename,
        "postprocessors": [{
            "key": "FFmpegExtractAudio",
            "preferredcodec": "wav",
            "preferredquality": "192",
        }],
    }

    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        # Check the video duration against YT_LENGTH_LIMIT_S before downloading anything.
        info = ydl.extract_info(yt_url, download=False)
        duration = info.get("duration") or 0
        if duration > YT_LENGTH_LIMIT_S:
            raise gr.Error(f"Maximum YouTube length is {YT_LENGTH_LIMIT_S} seconds, got {duration} seconds.")
        ydl.download([yt_url])

@spaces.GPU
def yt_transcribe(yt_url, task):
    html_embed_str = _return_yt_html_embed(yt_url)

    with tempfile.TemporaryDirectory() as tmpdirname:
        filepath = os.path.join(tmpdirname, "audio.wav")
        download_yt_audio(yt_url, filepath)

        audio = whisperx.load_audio(filepath)
        result = model.transcribe(audio, batch_size=BATCH_SIZE, task=task)

    # WhisperX returns {"segments": [...], "language": ...} without a top-level
    # "text" key, so join the segment texts manually.
    text = "".join(segment["text"] for segment in result["segments"])
    return html_embed_str, text

demo = gr.Blocks(theme=gr.themes.Ocean())

mf_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(sources="microphone", type="filepath"),
        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
    ],
    outputs="text",
    title="VerbaLens Demo 1: Prototype",
    description="Transcribe long-form microphone or audio inputs using WhisperX.",
    flagging_mode="never",
)

file_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(sources="upload", type="filepath", label="Audio file"),
        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
    ],
    outputs="text",
    title="VerbaLens Demo 1: Prototype",
    description="Transcribe uploaded audio files using WhisperX.",
    flagging_mode="never",
)

# Use a distinct name so the interface does not shadow the yt_transcribe function.
yt_transcribe_interface = gr.Interface(
    fn=yt_transcribe,
    inputs=[
        gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
    ],
    outputs=["html", "text"],
    title="VerbaLens Demo with WhisperX",
    description="Transcribe YouTube videos using WhisperX.",
    flagging_mode="never",
)

with demo:
    gr.TabbedInterface([mf_transcribe, file_transcribe, yt_transcribe_interface], ["Microphone", "Audio file", "YouTube"])

demo.queue().launch(ssr_mode=False)
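
# Local usage sketch (assumptions: the `spaces` package is installed, and
# HF_TOKEN is a token with access to pyannote's gated diarization models):
#   HF_TOKEN=<your-token> python app.py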