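"""Gradio Space that transcribes uploaded or recorded audio and YouTube videos with NbAiLab/nb-whisper-large."""
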
import time
import os

import torch
import gradio as gr
import pytube as pt
import spaces
from transformers import pipeline
# Use Flash Attention 2 when the optional flash_attn package is installed; otherwise fall back to SDPA.
try:
    import flash_attn  # noqa: F401
    FLASH_ATTENTION = True
except ImportError:
    FLASH_ATTENTION = False
MODEL_NAME = "NbAiLab/nb-whisper-large"
lang = "no"
# SHARE env var starting with "t", "y" or "1" (e.g. "true"/"yes"/"1") enables a public Gradio share link.
share = (os.environ.get("SHARE", "False")[0].lower() in "ty1") or None
# AUTH_TOKEN env var for gated/private model access; `True` falls back to the locally cached Hub token.
auth_token = os.environ.get("AUTH_TOKEN") or True
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
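
# The ASR pipeline is built per request inside the GPU-decorated function: 30-second chunking lets
# Whisper transcribe audio of arbitrary length, and @spaces.GPU requests the GPU for up to 120 seconds per call.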
@spaces.GPU(duration=60 * 2)
def pipe(file, return_timestamps=False):
    asr = pipeline(
        task="automatic-speech-recognition",
        model=MODEL_NAME,
        chunk_length_s=30,
        device=device,
        token=auth_token,
        torch_dtype=torch.float16,
        model_kwargs={"attn_implementation": "flash_attention_2"} if FLASH_ATTENTION else {"attn_implementation": "sdpa"},
    )
    asr.model.config.forced_decoder_ids = asr.tokenizer.get_decoder_prompt_ids(
        language=lang,
        task="transcribe",
        no_timestamps=not return_timestamps,
    )
    return asr(file, return_timestamps=return_timestamps, batch_size=24)
def transcribe(file, return_timestamps=False):
    if not return_timestamps:
        text = pipe(file)["text"]
    else:
        chunks = pipe(file, return_timestamps=True)["chunks"]
        text = []
        for chunk in chunks:
            start_time = time.strftime('%H:%M:%S', time.gmtime(chunk["timestamp"][0])) if chunk["timestamp"][0] is not None else "??:??:??"
            end_time = time.strftime('%H:%M:%S', time.gmtime(chunk["timestamp"][1])) if chunk["timestamp"][1] is not None else "??:??:??"
            line = f"[{start_time} -> {end_time}] {chunk['text']}"
            text.append(line)
        text = "\n".join(text)
    return text
def _return_yt_html_embed(yt_url):
    video_id = yt_url.split("?v=")[-1]
    HTML_str = (
        f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
        " </center>"
    )
    return HTML_str
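
# Download the audio track of a YouTube video with pytube, then reuse transcribe() on the saved file.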
def yt_transcribe(yt_url, return_timestamps=False):
    html_embed_str = _return_yt_html_embed(yt_url)
    try:
        yt = pt.YouTube(yt_url)
    except Exception as e:
        return html_embed_str, f"Error fetching YouTube video: {str(e)}"
    audio_streams = yt.streams.filter(only_audio=True)
    if not audio_streams:
        return html_embed_str, "No audio streams available for this video."
    stream = audio_streams[0]
    try:
        stream.download(filename="audio.mp3")
    except Exception as e:
        return html_embed_str, f"Error downloading audio: {str(e)}"
    if not os.path.exists("audio.mp3"):
        return html_embed_str, "Downloaded audio file not found."
    text = transcribe("audio.mp3", return_timestamps=return_timestamps)
    return html_embed_str, text
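
# The UI: two interfaces shown as tabs in a Blocks app, one for uploaded/recorded audio and one for YouTube URLs.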
demo = gr.Blocks()

mf_transcribe = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.components.Audio(sources=['upload', 'microphone'], type="filepath"),
        gr.components.Checkbox(label="Return timestamps"),
    ],
    outputs="text",
    title="NB-Whisper Demo",
    description=(
        "Transcribe long-form microphone or audio inputs with the click of a button! The demo uses the fine-tuned"
        f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
        " of arbitrary length."
    ),
    allow_flagging="never",
)
yt_transcribe = gr.Interface(
    fn=yt_transcribe,
    inputs=[
        gr.components.Textbox(lines=1, placeholder="Paste the URL to a YouTube video here", label="YouTube URL"),
        gr.components.Checkbox(label="Return timestamps"),
    ],
    examples=[["https://www.youtube.com/watch?v=mukeSSa5GKo"]],
    outputs=["html", "text"],
    title="Whisper Demo: Transcribe YouTube",
    description=(
        "Transcribe long-form YouTube videos with the click of a button! The demo uses the fine-tuned checkpoint"
        f" [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files of"
        " arbitrary length."
    ),
    allow_flagging="never",
)
with demo:
    gr.TabbedInterface(
        [mf_transcribe, yt_transcribe],
        ["Transkriber Lyd", "Transkriber YouTube"],
    )

demo.queue().launch(share=share)