import time
import os
import re
import torch
import gradio as gr
import spaces
from transformers import AutoFeatureExtractor, AutoTokenizer, WhisperForConditionalGeneration, WhisperProcessor, pipeline
from huggingface_hub import model_info
try:
import flash_attn
FLASH_ATTENTION = True
except ImportError:
FLASH_ATTENTION = False
import yt_dlp # Added import for yt-dlp
def _env_flag(name, default="False"):
    """Return True when environment variable ``name`` holds a truthy flag.

    A value counts as truthy when its first character, case-insensitively,
    is ``t``, ``y`` or ``1`` (e.g. "True", "yes", "1"). An unset variable
    falls back to ``default``; an empty value is treated as falsy rather
    than raising ``IndexError``.
    """
    value = os.environ.get(name, default)
    # Slice instead of indexing so an empty string yields "" (falsy) and
    # compare against discrete characters so "" can never match.
    return value[:1].lower() in ("t", "y", "1")


# Hub identifier of the checkpoint loaded by the transcription pipeline.
MODEL_NAME = "NbAiLab/nb-whisper-large"
# Language code handed to the tokenizer when building the decoder prompt.
lang = "no"
# The logo image is expected to live next to this source file.
logo_path = os.path.join(os.path.dirname(__file__), "Logo.png")
# True when SHARE is a truthy flag, otherwise None (not False).
share = _env_flag("SHARE") or None
# Use the explicit AUTH_TOKEN when provided; otherwise pass True, which the
# Hugging Face libraries interpret as "use the locally stored login token".
auth_token = os.environ.get("AUTH_TOKEN") or True
# Prefer the first CUDA device and fall back to CPU when none is available.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Bruker enhet: {device}")
@spaces.GPU(duration=60 * 2)
def pipe(file, return_timestamps=False):
    """Transcribe ``file`` with the ``MODEL_NAME`` speech-recognition model.

    A fresh ``transformers`` ASR pipeline is built on every call, forced to
    transcribe in the module-level ``lang``, and then run on ``file``.

    Args:
        file: Audio input forwarded unchanged to the pipeline (presumably a
            file path or another input type the ASR pipeline accepts —
            confirm against callers).
        return_timestamps: When true, timestamp tokens are kept in the
            decoder prompt and timestamps are requested from the pipeline.

    Returns:
        Whatever the pipeline call returns for ``file``.
    """
    # Half precision and FlashAttention are only dependable on CUDA; when
    # ``device`` has fallen back to CPU use float32 and the SDPA backend.
    on_cuda = device.type == "cuda"
    use_flash = FLASH_ATTENTION and on_cuda

    # NOTE: the pipeline (and therefore the model) is constructed per call.
    asr = pipeline(
        task="automatic-speech-recognition",
        model=MODEL_NAME,
        chunk_length_s=28,
        device=device,
        token=auth_token,
        torch_dtype=torch.float16 if on_cuda else torch.float32,
        model_kwargs={
            "attn_implementation": "flash_attention_2" if use_flash else "sdpa",
            "num_beams": 5,
        },
    )
    # Pin the decoder prompt to the configured language and the transcribe
    # task; timestamp tokens are suppressed unless explicitly requested.
    asr.model.config.forced_decoder_ids = asr.tokenizer.get_decoder_prompt_ids(
        language=lang,
        task="transcribe",
        no_timestamps=not return_timestamps,
    )
    return asr(file, return_timestamps=return_timestamps, batch_size=24)
def format_output(text):
# Add a newline after ".", "!", ":", or "?" unless part of sequences like "..."
text = re.sub(r'(? {end_time}] {chunk['text']}"
text.append(line)
formatted_text = "\n".join(text)
formatted_text += "
Transkribert med NB-Whisper demo"
return formatted_text
def _return_yt_html_embed(yt_url):
video_id = yt_url.split("?v=")[-1]
HTML_str = (
f'