import time
import os
import re

import torch
import gradio as gr
import spaces
from transformers import pipeline
try:
    import flash_attn  # noqa: F401 -- imported only to probe for availability
    FLASH_ATTENTION = True
except ImportError:
    FLASH_ATTENTION = False

import yt_dlp  # used to download audio from YouTube URLs

MODEL_NAME = "NbAiLab/nb-whisper-large"
lang = "no"
logo_path = os.path.join(os.path.dirname(__file__), "Logo.png")

# SHARE counts as true when its first character is "t", "y" or "1";
# otherwise share is None, which keeps Gradio's default behaviour.
share = (os.environ.get("SHARE", "False")[0].lower() in "ty1") or None
# Fall back to True so the pipeline authenticates with the locally cached token.
auth_token = os.environ.get("AUTH_TOKEN") or True

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

@spaces.GPU(duration=60 * 2)
def pipe(file, return_timestamps=False):
    # Prefer Flash Attention 2 when the package is installed, otherwise fall
    # back to PyTorch's SDPA implementation.
    attn_implementation = "flash_attention_2" if FLASH_ATTENTION else "sdpa"
    asr = pipeline(
        task="automatic-speech-recognition",
        model=MODEL_NAME,
        chunk_length_s=28,
        device=device,
        token=auth_token,
        torch_dtype=torch.float16,
        model_kwargs={"attn_implementation": attn_implementation, "num_beams": 5},
    )
    # Force Norwegian transcription and drop timestamp tokens unless requested.
    asr.model.config.forced_decoder_ids = asr.tokenizer.get_decoder_prompt_ids(
        language=lang,
        task="transcribe",
        no_timestamps=not return_timestamps,
    )
    return asr(file, return_timestamps=return_timestamps, batch_size=24)
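
# Example (illustrative only; "sample.mp3" is a placeholder path): the pipeline
# returns a dict with a "text" key, plus a "chunks" list of
# {"timestamp": (start, end), "text": ...} entries when return_timestamps=True.
#
#   result = pipe("sample.mp3", return_timestamps=True)
#   for chunk in result["chunks"]:
#       print(chunk["timestamp"], chunk["text"])
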
def format_output(text):
    # Insert a blank line after sentence-ending punctuation (".", "!", ":" or "?")
    # and after ellipses ("..."), without splitting the ellipses themselves.
    text = re.sub(r'(\.{3,}|(?<!\.)[.!:?](?!\.))', lambda m: m.group() + '\n\n', text)
    return text
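
# Worked example (illustrative input; note that the spaces following the
# punctuation are kept):
#
#   format_output("Hei. Dette er en test... Ferdig!")
#   -> "Hei.\n\n Dette er en test...\n\n Ferdig!\n\n"
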
def transcribe(file, return_timestamps=False):
    if not return_timestamps:
        text = pipe(file)["text"]
        formatted_text = format_output(text)
    else:
        chunks = pipe(file, return_timestamps=True)["chunks"]
        text = []
        for chunk in chunks:
            # Whisper can return None for a start or end time, e.g. on the
            # final chunk of a file.
            start_time = time.strftime('%H:%M:%S', time.gmtime(chunk["timestamp"][0])) if chunk["timestamp"][0] is not None else "??:??:??"
            end_time = time.strftime('%H:%M:%S', time.gmtime(chunk["timestamp"][1])) if chunk["timestamp"][1] is not None else "??:??:??"
            line = f"[{start_time} -> {end_time}] {chunk['text']}"
            text.append(line)
        formatted_text = "\n".join(text)
    formatted_text += "<br><br><i>Transkribert med NB-Whisper demo</i>"
    return formatted_text
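
# With return_timestamps=True each output line looks like (illustrative values):
#
#   [00:00:00 -> 00:00:07]  Hei, og velkommen til denne demonstrasjonen.
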
def _return_yt_html_embed(yt_url):
    # Assumes a standard watch URL (https://www.youtube.com/watch?v=<id>).
    video_id = yt_url.split("?v=")[-1]
    html_str = (
        f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
        " </center>"
    )
    return html_str
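
# For example, the (hypothetical) URL "https://www.youtube.com/watch?v=abc123"
# yields an iframe pointing at "https://www.youtube.com/embed/abc123". The naive
# "?v=" split would mis-handle youtu.be short links or URLs with extra query
# parameters.
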
def yt_transcribe(yt_url, return_timestamps=False):
    html_embed_str = _return_yt_html_embed(yt_url)
    # Download the best available audio stream and convert it to MP3 via ffmpeg.
    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': 'audio.%(ext)s',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
        'quiet': True,
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([yt_url])
    text = transcribe("audio.mp3", return_timestamps=return_timestamps)
    return html_embed_str, text
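
# Example call (hypothetical URL; the MP3 conversion requires ffmpeg on PATH):
#
#   html, text = yt_transcribe("https://www.youtube.com/watch?v=abc123",
#                              return_timestamps=True)
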
# Build the Gradio app (single interface, no tabs)
demo = gr.Blocks()
with demo:
    gr.Image(value=logo_path, type="filepath", label=None, width=100)
    mf_transcribe = gr.Interface(
        fn=transcribe,
        inputs=[
            gr.components.Audio(sources=['upload', 'microphone'], type="filepath"),
            gr.components.Checkbox(label="Inkluder tidsstempler"),
        ],
        outputs=gr.HTML(label="text"),
        title="NB-Whisper",
        description=(
            "Transkriber lange lydopptak fra mikrofon eller lydfiler med et enkelt klikk! Demoen bruker den fintunede"
            f" modellen [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) og 🤗 Transformers til å transkribere lydfiler opp til 30 minutter."
        ),
        allow_flagging="never",
    )

    # Uncomment to add the YouTube transcription interface if needed
    # yt_transcribe_interface = gr.Interface(
    #     fn=yt_transcribe,
    #     inputs=[
    #         gr.components.Textbox(lines=1, placeholder="Lim inn URL til en YouTube-video her", label="YouTube URL"),
    #         gr.components.Checkbox(label="Inkluder tidsstempler"),
    #     ],
    #     outputs=["html", "text"],
    #     examples=[["https://www.youtube.com/watch?v=mukeSSa5GKo"]],
    #     title="Whisper Demo: Transkriber YouTube",
    #     description=(
    #         "Transkriber lange YouTube-videoer med et enkelt klikk! Demoen bruker den fintunede modellen:"
    #         f" [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) og 🤗 Transformers til å transkribere lydfiler av"
    #         " vilkårlig lengde."
    #     ),
    #     allow_flagging="never",
    # )

# Launch the demo (no tabs); the queue must be enabled before launching.
demo.queue().launch(share=share, show_api=False)