whisper_audio-text-translate

Sleeping

File size: 4,691 Bytes

7b70f57
2106f78
7b70f57
2106f78
7b70f57
2106f78
7b70f57
 
 
e4531cf
2106f78
7b70f57
2106f78
7b70f57
ee7903b
7b70f57
2106f78
 
 
 
 
 
 
 
 
7b70f57
 
ee7903b
7b70f57
ee7903b
2106f78
ee7903b
52007c6
2106f78
ee7903b
7b70f57
 
 
 
 
 
 
e4531cf
7b70f57
 
ee7903b
2106f78
7b70f57
 
 
ee7903b
7b70f57
 
 
ee7903b
7b70f57
 
 
 
 
ee7903b
7b70f57
 
 
ee7903b
 
7b70f57
ee7903b
7b70f57
 
 
 
 
 
 
ee7903b
7b70f57
 
 
 
 
 
 
 
 
 
 
ee7903b
7b70f57
 
 
 
ee7903b
cfbec94
7b70f57
 
ee7903b
7b70f57
ee7903b
7b70f57
ee7903b
 
7b70f57
 
 
 
 
 
ee7903b
7b70f57
ee7903b
7b70f57
ee7903b
 
7b70f57
 
 
 
ee7903b
7b70f57
ee7903b
7b70f57
ee7903b
7b70f57
ee7903b
 
7b70f57
 
 
 
 
ee7903b
7b70f57

import os
import tempfile
import time
import urllib.parse

# NOTE(review): `spaces` is deliberately kept ahead of `torch`, as in the
# original import order — confirm against the ZeroGPU docs before reordering.
import spaces
import torch

import gradio as gr
import yt_dlp as youtube_dl
from transformers import pipeline
from transformers.pipelines.audio_utils import ffmpeg_read

# Hugging Face checkpoint id loaded by the pipeline below and linked in the UI descriptions.
MODEL_NAME = "openai/whisper-large-v3-turbo"
# Passed as `batch_size` to every pipeline call in this file.
BATCH_SIZE = 8
# NOTE(review): not referenced anywhere in the visible source — confirm whether it is still needed.
FILE_LIMIT_MB = 1000
YT_LENGTH_LIMIT_S = 3600  # limit to 1 hour YouTube files

# Use CUDA device index 0 when torch reports CUDA is available, otherwise the CPU.
device = 0 if torch.cuda.is_available() else "cpu"

# Shared speech-recognition pipeline, built once at import time and reused by
# both transcription entry points.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device,
)


@spaces.GPU
def transcribe(inputs):
    """Transcribe an audio input with the shared Whisper pipeline.

    Args:
        inputs: Audio input forwarded unchanged to ``pipe``. The Gradio audio
            components wired to this function are configured with
            ``type="filepath"``. ``None`` means nothing was uploaded or recorded.

    Returns:
        The ``"text"`` entry of the pipeline result.

    Raises:
        gr.Error: If ``inputs`` is ``None``.
    """
    if inputs is None:
        raise gr.Error("未提供音訊檔案！請在提交請求前上傳或錄製一個音訊檔案。")

    result = pipe(
        inputs,
        batch_size=BATCH_SIZE,
        generate_kwargs={"task": "transcribe"},
        return_timestamps=True,
    )
    return result["text"]


def _return_yt_html_embed(yt_url):
    """Build the HTML snippet that embeds a YouTube video in an iframe.

    The video id is taken from the ``v`` query parameter when present
    (regardless of its position or of trailing parameters such as ``&t=``),
    otherwise from ``youtu.be/<id>``, ``/shorts/<id>``, ``/embed/<id>`` or
    ``/live/<id>`` style paths. If none of these match, the text after the
    last ``"?v="`` is used, as before.

    Args:
        yt_url: URL string typed by the user (untrusted).

    Returns:
        An HTML string containing a centered ``<iframe>`` pointing at
        ``https://www.youtube.com/embed/<id>``.
    """
    candidate = yt_url.strip()
    # urlparse only fills in netloc/hostname when a scheme separator is present.
    if "://" not in candidate:
        candidate = "https://" + candidate

    video_id = None
    try:
        parsed = urllib.parse.urlparse(candidate)
        query_ids = urllib.parse.parse_qs(parsed.query).get("v")
        path_parts = [part for part in parsed.path.split("/") if part]
        host = parsed.hostname or ""
        if query_ids:
            video_id = query_ids[0]
        elif path_parts and (host == "youtu.be" or host.endswith(".youtu.be")):
            video_id = path_parts[0]
        elif len(path_parts) >= 2 and path_parts[0] in ("shorts", "embed", "live"):
            video_id = path_parts[1]
    except ValueError:
        # Malformed URL (e.g. unbalanced IPv6 brackets): fall through to the legacy split.
        video_id = None

    if video_id is None:
        video_id = yt_url.split("?v=")[-1]

    # The id comes from user input and lands inside an HTML attribute, so
    # percent-encode everything that is not an unreserved URL character.
    # Real YouTube ids pass through unchanged.
    safe_id = urllib.parse.quote(video_id, safe="")
    return (
        f'<center> <iframe width="500" height="320" src="https://www.youtube.com/embed/{safe_id}"> </iframe>'
        " </center>"
    )

def download_yt_audio(yt_url, filename):
    """Download a YouTube video to ``filename`` after checking its duration.

    Metadata is fetched first without downloading; the download only starts
    when the reported duration does not exceed ``YT_LENGTH_LIMIT_S``.

    Args:
        yt_url: URL of the video to fetch.
        filename: Output path handed to yt-dlp as its ``outtmpl``.

    Raises:
        gr.Error: If metadata extraction or the download fails, or if the
            video is longer than ``YT_LENGTH_LIMIT_S`` seconds.
    """
    # Metadata-only pass; the context manager closes the loader afterwards.
    with youtube_dl.YoutubeDL() as info_loader:
        try:
            info = info_loader.extract_info(yt_url, download=False)
        except youtube_dl.utils.DownloadError as err:
            raise gr.Error(str(err)) from err

    # "SS", "MM:SS" or "H:MM:SS" -> seconds. Folding in base 60 gives the same
    # result as padding to three fields and also copes with extra leading fields.
    file_length_s = 0
    for sub_length in info["duration_string"].split(":"):
        file_length_s = file_length_s * 60 + int(sub_length)

    if file_length_s > YT_LENGTH_LIMIT_S:
        yt_length_limit_hms = time.strftime("%HH:%MM:%SS", time.gmtime(YT_LENGTH_LIMIT_S))
        file_length_hms = time.strftime("%HH:%MM:%SS", time.gmtime(file_length_s))
        raise gr.Error(f"最大YouTube影片長度為 {yt_length_limit_hms}，但提供的影片長度為 {file_length_hms}。")

    ydl_opts = {"outtmpl": filename, "format": "worstvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best"}

    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        try:
            ydl.download([yt_url])
        except (youtube_dl.utils.DownloadError, youtube_dl.utils.ExtractorError) as err:
            # The metadata call above already surfaces DownloadError from
            # YoutubeDL, so catch it here too alongside ExtractorError.
            raise gr.Error(str(err)) from err

@spaces.GPU
def yt_transcribe(yt_url, max_filesize=75.0):
    """Download a YouTube video and transcribe its audio track.

    Args:
        yt_url: URL of the YouTube video.
        max_filesize: Accepted for interface compatibility; not used by the body.

    Returns:
        A ``(html, text)`` tuple: the iframe embed snippet for the video and
        the ``"text"`` entry of the pipeline result.
    """
    embed_html = _return_yt_html_embed(yt_url)

    # The download lives in a throwaway directory; only its raw bytes outlive it.
    with tempfile.TemporaryDirectory() as workdir:
        video_path = os.path.join(workdir, "video.mp4")
        download_yt_audio(yt_url, video_path)
        with open(video_path, "rb") as video_file:
            raw_bytes = video_file.read()

    # Decode the bytes to a waveform at the rate the feature extractor reports.
    sampling_rate = pipe.feature_extractor.sampling_rate
    waveform = ffmpeg_read(raw_bytes, sampling_rate)
    audio = {"array": waveform, "sampling_rate": sampling_rate}

    result = pipe(
        audio,
        batch_size=BATCH_SIZE,
        generate_kwargs={"task": "transcribe"},
        return_timestamps=True,
    )
    return embed_html, result["text"]


# Top-level Blocks container (Ocean theme) that hosts the tabbed interfaces defined below.
demo = gr.Blocks(theme=gr.themes.Ocean())

# Microphone tab: records audio in the browser and passes it to `transcribe`
# as a file path.
mf_transcribe = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(sources="microphone", type="filepath"),
    outputs="text",
    title="清華大學多模態課程＆廖老師嫡傳弟子-第二組 「語音轉文字」模型",
    description=(
        "只需點擊一下按鈕，即可轉錄長篇的麥克風或音訊輸入！此示範使用"
        f"檢查點 [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) 和 🤗 Transformers 來轉錄任意長度的音訊檔案。"
    ),
    # `flagging_mode` is the Gradio 5 name for the deprecated `allow_flagging`.
    flagging_mode="never",
)

# Upload tab: accepts an audio file and passes it to `transcribe` as a file path.
file_transcribe = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(sources="upload", type="filepath", label="音訊檔案"),
    outputs="text",
    title="Whisper Large V3: 音訊轉錄",
    description=(
        "只需點擊一下按鈕，即可轉錄長篇的麥克風或音訊輸入！此示範使用"
        f"檢查點 [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) 和 🤗 Transformers 來轉錄任意長度的音訊檔案。"
    ),
    # `flagging_mode` is the Gradio 5 name for the deprecated `allow_flagging`.
    flagging_mode="never",
)

# YouTube tab: takes a video URL and shows the embedded player plus the transcript
# returned by `yt_transcribe`.
yt_transcribe_interface = gr.Interface(
    fn=yt_transcribe,
    inputs=gr.Textbox(lines=1, placeholder="在此貼上YouTube影片的URL", label="YouTube URL"),
    outputs=["html", "text"],
    title="Whisper Large V3: YouTube轉錄",
    description=(
        "只需點擊一下按鈕，即可轉錄長篇的YouTube影片！此示範使用"
        f"檢查點 [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) 和 🤗 Transformers 來轉錄任意長度的影片檔案。"
    ),
    # `flagging_mode` is the Gradio 5 name for the deprecated `allow_flagging`.
    flagging_mode="never",
)

# Mount the three interfaces as tabs inside the Blocks container; the tab
# labels are given in the same order as the interfaces.
with demo:
    gr.TabbedInterface([mf_transcribe, file_transcribe, yt_transcribe_interface], ["麥克風", "音訊檔案", "YouTube"])

# Enable the request queue, then start the app with ssr_mode=False.
demo.queue().launch(ssr_mode=False)