File size: 4,691 Bytes
7b70f57 2106f78 7b70f57 2106f78 7b70f57 2106f78 7b70f57 e4531cf 2106f78 7b70f57 2106f78 7b70f57 ee7903b 7b70f57 2106f78 7b70f57 ee7903b 7b70f57 ee7903b 2106f78 ee7903b 52007c6 2106f78 ee7903b 7b70f57 e4531cf 7b70f57 ee7903b 2106f78 7b70f57 ee7903b 7b70f57 ee7903b 7b70f57 ee7903b 7b70f57 ee7903b 7b70f57 ee7903b 7b70f57 ee7903b 7b70f57 ee7903b 7b70f57 ee7903b cfbec94 7b70f57 ee7903b 7b70f57 ee7903b 7b70f57 ee7903b 7b70f57 ee7903b 7b70f57 ee7903b 7b70f57 ee7903b 7b70f57 ee7903b 7b70f57 ee7903b 7b70f57 ee7903b 7b70f57 ee7903b 7b70f57 ee7903b 7b70f57 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 |
import html
import os
import tempfile
import time
from urllib.parse import parse_qs, urlparse

import spaces
import torch
import gradio as gr
import yt_dlp as youtube_dl
from transformers import pipeline
from transformers.pipelines.audio_utils import ffmpeg_read
# Checkpoint loaded by the speech-recognition pipeline below; also shown in
# the UI descriptions.
MODEL_NAME = "openai/whisper-large-v3-turbo"
# Batch size passed to every pipeline call.
BATCH_SIZE = 8
# Not referenced anywhere in the visible part of this file.
FILE_LIMIT_MB = 1000
YT_LENGTH_LIMIT_S = 3600  # limit to 1 hour YouTube files

# Use the first CUDA device when one is available, otherwise run on the CPU.
if torch.cuda.is_available():
    device = 0
else:
    device = "cpu"

# Shared ASR pipeline used by both transcription entry points.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device,
)
@spaces.GPU
def transcribe(inputs):
    """Transcribe a recorded or uploaded audio file.

    Args:
        inputs: Value handed over by the Gradio audio component (configured
            with ``type="filepath"``), or ``None`` when nothing was provided.

    Returns:
        The ``"text"`` field of the pipeline result.

    Raises:
        gr.Error: If ``inputs`` is ``None``.
    """
    if inputs is None:
        raise gr.Error("未提供音訊檔案!請在提交請求前上傳或錄製一個音訊檔案。")

    # Timestamps are requested from the pipeline, but only the plain text of
    # the result is returned to the UI.
    result = pipe(
        inputs,
        batch_size=BATCH_SIZE,
        generate_kwargs={"task": "transcribe"},
        return_timestamps=True,
    )
    return result["text"]
def _return_yt_html_embed(yt_url):
video_id = yt_url.split("?v=")[-1]
HTML_str = (
f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"> </iframe>'
" </center>"
)
return HTML_str
def _duration_to_seconds(duration_string):
    """Convert a colon-separated duration ("SS", "MM:SS", "HH:MM:SS") to seconds."""
    total_seconds = 0
    # Fold the fields as base-60 digits; this covers one, two or three fields
    # without padding the list by hand.
    for field in duration_string.split(":"):
        total_seconds = total_seconds * 60 + int(field)
    return total_seconds


def download_yt_audio(yt_url, filename):
    """Download a YouTube video to ``filename`` after checking its length.

    Args:
        yt_url: URL of the video to download.
        filename: Output path, passed to yt-dlp as ``outtmpl``.

    Raises:
        gr.Error: If the metadata cannot be fetched, the duration is
            unavailable, the video is longer than ``YT_LENGTH_LIMIT_S``
            seconds, or the download itself fails.
    """
    try:
        with youtube_dl.YoutubeDL() as info_loader:
            info = info_loader.extract_info(yt_url, download=False)
    except youtube_dl.utils.DownloadError as err:
        raise gr.Error(str(err)) from err

    # The duration may be missing from the metadata; report that to the user
    # instead of failing with a bare KeyError.
    duration_string = (info or {}).get("duration_string")
    if not duration_string:
        raise gr.Error("無法取得YouTube影片的長度。")

    file_length_s = _duration_to_seconds(duration_string)
    if file_length_s > YT_LENGTH_LIMIT_S:
        # Rendered as e.g. "01H:00M:00S".
        yt_length_limit_hms = time.strftime("%HH:%MM:%SS", time.gmtime(YT_LENGTH_LIMIT_S))
        file_length_hms = time.strftime("%HH:%MM:%SS", time.gmtime(file_length_s))
        raise gr.Error(f"最大YouTube影片長度為 {yt_length_limit_hms},但提供的影片長度為 {file_length_hms}。")

    ydl_opts = {"outtmpl": filename, "format": "worstvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best"}
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        try:
            ydl.download([yt_url])
        # Catch DownloadError here as well, matching the metadata call above,
        # so a failed download is shown to the user as a gr.Error.
        except (youtube_dl.utils.DownloadError, youtube_dl.utils.ExtractorError) as err:
            raise gr.Error(str(err)) from err
@spaces.GPU
def yt_transcribe(yt_url, max_filesize=75.0):
    """Download a YouTube video and transcribe its audio track.

    Args:
        yt_url: URL of the YouTube video.
        max_filesize: Accepted for interface compatibility; not used by this
            function.

    Returns:
        A ``(html_embed, text)`` tuple: the embed snippet for the video and
        the ``"text"`` field of the pipeline result.
    """
    embed_html = _return_yt_html_embed(yt_url)

    # Download into a throwaway directory and pull the raw bytes into memory
    # so nothing is left on disk once the block exits.
    with tempfile.TemporaryDirectory() as workdir:
        video_path = os.path.join(workdir, "video.mp4")
        download_yt_audio(yt_url, video_path)
        with open(video_path, "rb") as media_file:
            raw_bytes = media_file.read()

    # Decode at the sampling rate of the pipeline's feature extractor.
    sampling_rate = pipe.feature_extractor.sampling_rate
    waveform = ffmpeg_read(raw_bytes, sampling_rate)
    audio = {"array": waveform, "sampling_rate": sampling_rate}

    result = pipe(
        audio,
        batch_size=BATCH_SIZE,
        generate_kwargs={"task": "transcribe"},
        return_timestamps=True,
    )
    return embed_html, result["text"]
# Top-level container that hosts the three tabs.
demo = gr.Blocks(theme=gr.themes.Ocean())

# The microphone and file-upload tabs show the same description text.
_audio_description = (
    "只需點擊一下按鈕,即可轉錄長篇的麥克風或音訊輸入!此示範使用"
    f"檢查點 [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) 和 🤗 Transformers 來轉錄任意長度的音訊檔案。"
)

# Tab 1: record from the microphone.
# ``flagging_mode`` replaces the deprecated ``allow_flagging`` argument.
mf_transcribe = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(sources="microphone", type="filepath"),
    outputs="text",
    title="清華大學多模態課程&廖老師嫡傳弟子-第二組 「語音轉文字」模型",
    description=_audio_description,
    flagging_mode="never",
)

# Tab 2: upload an audio file.
file_transcribe = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(sources="upload", type="filepath", label="音訊檔案"),
    outputs="text",
    title="Whisper Large V3: 音訊轉錄",
    description=_audio_description,
    flagging_mode="never",
)

# Tab 3: transcribe a YouTube video from its URL.
yt_transcribe_interface = gr.Interface(
    fn=yt_transcribe,
    inputs=gr.Textbox(lines=1, placeholder="在此貼上YouTube影片的URL", label="YouTube URL"),
    outputs=["html", "text"],
    title="Whisper Large V3: YouTube轉錄",
    description=(
        "只需點擊一下按鈕,即可轉錄長篇的YouTube影片!此示範使用"
        f"檢查點 [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) 和 🤗 Transformers 來轉錄任意長度的影片檔案。"
    ),
    flagging_mode="never",
)

with demo:
    gr.TabbedInterface(
        [mf_transcribe, file_transcribe, yt_transcribe_interface],
        ["麥克風", "音訊檔案", "YouTube"],
    )

# Enable request queuing and start the app with server-side rendering off.
demo.queue().launch(ssr_mode=False)