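"""Gradio demo for Norwegian speech recognition with NB-Whisper.

Audio can be uploaded, recorded with the microphone, or (optionally) pulled
from YouTube with yt-dlp. Recordings longer than MAX_AUDIO_LENGTH seconds are
truncated before transcription.
"""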
import time
import os
import re
import torch
import torchaudio
import gradio as gr
import spaces
from transformers import AutoFeatureExtractor, AutoTokenizer, WhisperForConditionalGeneration, WhisperProcessor, pipeline
from huggingface_hub import model_info
try:
    import flash_attn
    FLASH_ATTENTION = True
except ImportError:
    FLASH_ATTENTION = False

import yt_dlp  # yt-dlp is used to download audio from YouTube
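# Configuration: the model, target language, logo path, and maximum audio
# length are set below; SHARE and AUTH_TOKEN are read from the environment.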
MODEL_NAME = "NbAiLab/nb-whisper-large"
lang = "no"
logo_path = os.path.join(os.path.dirname(__file__), "Logo_2.png")
MAX_AUDIO_LENGTH = 1 * 60  # maximum audio length in seconds; longer input is truncated
share = (os.environ.get("SHARE", "False")[0].lower() in "ty1") or None
auth_token = os.environ.get("AUTH_TOKEN") or True
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Bruker enhet: {device}")
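# pipe() builds the Transformers ASR pipeline and runs inference inside a
# @spaces.GPU-decorated function so the work is scheduled on the Space's GPU.
# Flash Attention 2 is used when the flash_attn package is installed,
# otherwise PyTorch SDPA.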
@spaces.GPU(duration=60 * 2)
def pipe(file, return_timestamps=False):
    asr = pipeline(
        task="automatic-speech-recognition",
        model=MODEL_NAME,
        chunk_length_s=28,
        device=device,
        token=auth_token,
        torch_dtype=torch.float16,
        model_kwargs={"attn_implementation": "flash_attention_2", "num_beams": 5}
        if FLASH_ATTENTION
        else {"attn_implementation": "sdpa", "num_beams": 5},
    )
    asr.model.config.forced_decoder_ids = asr.tokenizer.get_decoder_prompt_ids(
        language=lang,
        task="transcribe",
        no_timestamps=not return_timestamps,
    )
    return asr(file, return_timestamps=return_timestamps, batch_size=24)
def format_output(text):
    # Add a newline after ".", "!", ":" or "?" unless it is part of a sequence like "..."
    text = re.sub(r'(?<!\.)[.!:?](?!\.)', lambda m: m.group() + '\n', text)
    # Ensure a blank line after sequences like "..." or other punctuation patterns
    text = re.sub(r'(\.{3,}|[.!:?])', lambda m: m.group() + '\n\n', text)
    return text
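# Transcribe a local audio file. Input longer than MAX_AUDIO_LENGTH seconds is
# truncated first, and a disclaimer is appended to the transcript in that case.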
def transcribe(file, return_timestamps=False):
    waveform, sample_rate = torchaudio.load(file)
    audio_duration = waveform.size(1) / sample_rate

    if audio_duration > MAX_AUDIO_LENGTH:
        # Trim the waveform to the maximum allowed length
        waveform = waveform[:, :int(MAX_AUDIO_LENGTH * sample_rate)]
        truncated_file = "truncated_audio.wav"
        torchaudio.save(truncated_file, waveform, sample_rate)
        file_to_transcribe = truncated_file
        truncated = True
    else:
        file_to_transcribe = file
        truncated = False

    if not return_timestamps:
        text = pipe(file_to_transcribe)["text"]
        formatted_text = format_output(text)
    else:
        chunks = pipe(file_to_transcribe, return_timestamps=True)["chunks"]
        text = []
        for chunk in chunks:
            start_time = time.strftime('%H:%M:%S', time.gmtime(chunk["timestamp"][0])) if chunk["timestamp"][0] is not None else "??:??:??"
            end_time = time.strftime('%H:%M:%S', time.gmtime(chunk["timestamp"][1])) if chunk["timestamp"][1] is not None else "??:??:??"
            line = f"[{start_time} -> {end_time}] {chunk['text']}"
            text.append(line)
        formatted_text = "\n".join(text)

    if truncated:
        disclaimer = (
            "\n\nDette er en demo. Det er ikke tillatt å bruke denne teksten i profesjonell sammenheng. "
            "Vi anbefaler at hvis du trenger å transkribere lengre opptak, så kjører du enten modellen lokalt "
            "eller sjekker denne siden for å se hvem som leverer løsninger basert på NB-Whisper: "
            "https://github.com/NbAiLab/nostram/blob/main/leverandorer.md"
        )
        formatted_text += f"<br><br><i>{disclaimer}</i>"

    formatted_text += "<br><br><i>Transkribert med NB-Whisper demo</i>"
    return formatted_text
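# Helper that returns an HTML <iframe> snippet embedding the YouTube video in the UI.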
def _return_yt_html_embed(yt_url):
    video_id = yt_url.split("?v=")[-1]
    HTML_str = (
        f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
        " </center>"
    )
    return HTML_str
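# Download the best audio stream of a YouTube video with yt-dlp, convert it to
# mp3, and transcribe the result.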
def yt_transcribe(yt_url, return_timestamps=False):
    html_embed_str = _return_yt_html_embed(yt_url)
    ydl_opts = {
        'format': 'bestaudio/best',
        'outtmpl': 'audio.%(ext)s',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
        'quiet': True,
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.download([yt_url])
    text = transcribe("audio.mp3", return_timestamps=return_timestamps)
    return html_embed_str, text
# Build the Gradio app without tabs
demo = gr.Blocks()

with demo:
    with gr.Row():
        gr.HTML("<img src='file/Logo_2.png'>")
        with gr.Column(scale=8):
            # Use Markdown for the title and description
            gr.Markdown(
                """
                <h1 style="font-size: 3em;">NB-Whisper Demo</h1>
                """
            )
    mf_transcribe = gr.Interface(
        fn=transcribe,
        inputs=[
            gr.components.Audio(sources=['upload', 'microphone'], type="filepath"),
            gr.components.Checkbox(label="Inkluder tidsstempler"),
        ],
        outputs=gr.HTML(label="text"),
        description=(
            "Transkriber lange lydopptak fra mikrofon eller lydfiler med et enkelt klikk! Demoen bruker den fintunede"
            f" modellen [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) og 🤗 Transformers til å transkribere lydfiler opp til 30 minutter."
        ),
        allow_flagging="never",
        # show_submit_button=False,
    )
    # Uncomment to add the YouTube transcription interface if needed
    # yt_transcribe_interface = gr.Interface(
    #     fn=yt_transcribe,
    #     inputs=[
    #         gr.components.Textbox(lines=1, placeholder="Lim inn URL til en YouTube-video her", label="YouTube URL"),
    #         gr.components.Checkbox(label="Inkluder tidsstempler"),
    #     ],
    #     examples=[["https://www.youtube.com/watch?v=mukeSSa5GKo"]],
    #     outputs=["html", "text"],
    #     title="Whisper Demo: Transkriber YouTube",
    #     description=(
    #         "Transkriber lange YouTube-videoer med et enkelt klikk! Demoen bruker den fintunede modellen:"
    #         f" [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) og 🤗 Transformers til å transkribere lydfiler av"
    #         " vilkårlig lengde."
    #     ),
    #     allow_flagging="never",
    # )
# Launch the demo without tabs
demo.queue().launch(share=share, show_api=False, allowed_paths=["Logo_2.png"])
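# To run locally (assuming this file is saved as app.py): python app.py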