import spaces
import torch
import gradio as gr
import yt_dlp as youtube_dl
import whisperx
import tempfile
import os
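
# Runtime configuration: run on GPU when available, otherwise fall back to CPU.
# float32 is used so the same compute type works on both CPU and GPU
# (float16 is only supported on CUDA with the CTranslate2 backend).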
device = "cuda" if torch.cuda.is_available() else "cpu"
BATCH_SIZE = 4
FILE_LIMIT_MB = 1000  # currently unused
COMPUTE_TYPE = "float32"
YT_LENGTH_LIMIT_S = 3600  # limit YouTube videos to 1 hour

model = whisperx.load_model("large-v2", device, compute_type=COMPUTE_TYPE)
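
# --- Request handlers ---
# The ASR model above is loaded once at startup and shared across requests;
# the alignment model depends on the language detected at transcription time,
# so it (and the diarization pipeline) is loaded per request below.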
@spaces.GPU
def transcribe(inputs, task):
    if inputs is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")

    # Transcribe (or translate) with the preloaded WhisperX model,
    # forwarding the task selected in the UI.
    audio = whisperx.load_audio(inputs)
    result = model.transcribe(audio, batch_size=BATCH_SIZE, task=task)

    # Align the transcript to get accurate word-level timestamps.
    model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
    result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)

    # Diarize and attach speaker labels to each segment. This requires an
    # HF_TOKEN with access to the gated pyannote diarization models.
    diarize_model = whisperx.DiarizationPipeline(use_auth_token=os.getenv("HF_TOKEN"), device=device)
    diarize_segments = diarize_model(audio)
    result = whisperx.assign_word_speakers(diarize_segments, result)

    output_text = ""
    for segment in result["segments"]:
        speaker = segment.get("speaker", "Unknown Speaker")
        text = segment["text"]
        output_text += f"{speaker}: {text}\n"
    return output_text
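
# Illustrative output (speaker labels such as SPEAKER_00 come from the
# diarization pipeline; unlabeled segments fall back to "Unknown Speaker"):
#   SPEAKER_00: Hello and welcome to the show.
#   SPEAKER_01: Thanks for having me.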

def _return_yt_html_embed(yt_url):
    video_id = yt_url.split("?v=")[-1]
    return f'<center><iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"></iframe></center>'

def download_yt_audio(yt_url, filename):
    # Enforce the YouTube length limit before downloading anything.
    with youtube_dl.YoutubeDL() as ydl:
        try:
            info = ydl.extract_info(yt_url, download=False)
        except youtube_dl.utils.DownloadError as err:
            raise gr.Error(str(err))
    if info.get("duration", 0) > YT_LENGTH_LIMIT_S:
        raise gr.Error(f"Maximum YouTube video length is {YT_LENGTH_LIMIT_S} seconds, got {info['duration']} seconds.")
    ydl_opts = {
        "format": "bestaudio/best",
        # Download under the source extension and let FFmpegExtractAudio
        # convert to the .wav path the caller expects; a fixed ".wav"
        # outtmpl would clash with the postprocessor's output filename.
        "outtmpl": os.path.splitext(filename)[0] + ".%(ext)s",
        "postprocessors": [{
            "key": "FFmpegExtractAudio",
            "preferredcodec": "wav",
            "preferredquality": "192",
        }],
    }
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        ydl.download([yt_url])
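
# Note: both the FFmpegExtractAudio postprocessor and whisperx.load_audio
# shell out to ffmpeg, so the ffmpeg binary must be available on PATH.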

@spaces.GPU
def yt_transcribe(yt_url, task):
    html_embed_str = _return_yt_html_embed(yt_url)
    with tempfile.TemporaryDirectory() as tmpdirname:
        filepath = os.path.join(tmpdirname, "audio.wav")
        download_yt_audio(yt_url, filepath)
        audio = whisperx.load_audio(filepath)
    result = model.transcribe(audio, batch_size=BATCH_SIZE, task=task)
    # model.transcribe() returns {"segments": [...], "language": ...} rather
    # than a flat "text" field, so join the segment texts for display.
    text = " ".join(segment["text"].strip() for segment in result["segments"])
    return html_embed_str, text
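
# Unlike transcribe(), the YouTube path skips alignment and diarization and
# returns only the raw WhisperX transcription alongside the video embed.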

demo = gr.Blocks(theme=gr.themes.Ocean())

mf_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(sources="microphone", type="filepath"),
        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
    ],
    outputs="text",
    title="VerbaLens Demo 1 : Prototype",
    description="Transcribe long-form microphone or audio inputs using WhisperX.",
    allow_flagging="never",
)

file_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(sources="upload", type="filepath", label="Audio file"),
        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
    ],
    outputs="text",
    title="VerbaLens Demo 1 : Prototype",
    description="Transcribe uploaded audio files using WhisperX.",
    allow_flagging="never",
)

yt_transcribe = gr.Interface(
    fn=yt_transcribe,
    inputs=[
        gr.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
        gr.Radio(["transcribe", "translate"], label="Task", value="transcribe"),
    ],
    outputs=["html", "text"],
    title="VerbaLens Demo with WhisperX",
    description="Transcribe YouTube videos using WhisperX.",
    allow_flagging="never",
)

with demo:
    gr.TabbedInterface([mf_transcribe, file_transcribe, yt_transcribe], ["Microphone", "Audio file", "YouTube"])

demo.queue().launch(ssr_mode=False)