whisper_audio-text-translate

Sleeping

App Files Community

Marathon23 committed on Oct 22, 2024

Commit

ee7903b

verified ·

1 Parent(s): 080c996

Update app.py

Browse files

Files changed (1) hide show

app.py +29 -71

app.py CHANGED Viewed

@@ -8,15 +8,11 @@ from transformers.pipelines.audio_utils import ffmpeg_read
 import tempfile
 import os
-import openai
-# 設定 OpenAI API 金鑰（請替換為您自己的 API 金鑰）
-openai.api_key = "YOUR_OPENAI_API_KEY"
 MODEL_NAME = "openai/whisper-large-v3-turbo"
 BATCH_SIZE = 8
 FILE_LIMIT_MB = 1000
-YT_LENGTH_LIMIT_S = 3600  # 限制 YouTube 檔案為 1 小時
 device = 0 if torch.cuda.is_available() else "cpu"
@@ -27,34 +23,16 @@ pipe = pipeline(
     device=device,
 )
-def translate_text(input_text, target_language):
-    prompt = f"請將以下文字翻譯成{target_language}：\n\n{input_text}"
-    try:
-        response = openai.ChatCompletion.create(
-            model="gpt-4o",
-            messages=[
-                {"role": "user", "content": prompt}
-            ]
-        )
-        translated_text = response['choices'][0]['message']['content'].strip()
-        return translated_text
-    except Exception as e:
-        raise gr.Error(f"翻譯過程中出現錯誤：{str(e)}")
 @spaces.GPU
-def transcribe(inputs, task, translate_option, target_language):
     if inputs is None:
-        raise gr.Error("未提交音訊檔案！請在提交請求前上傳或錄製音訊檔案。")
-    result = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)
-    text = result["text"]
-    if translate_option == "是":
-        text = translate_text(text, target_language)
     return text
 def _return_yt_html_embed(yt_url):
     video_id = yt_url.split("?v=")[-1]
     HTML_str = (
@@ -65,29 +43,29 @@ def _return_yt_html_embed(yt_url):
 def download_yt_audio(yt_url, filename):
     info_loader = youtube_dl.YoutubeDL()
     try:
         info = info_loader.extract_info(yt_url, download=False)
     except youtube_dl.utils.DownloadError as err:
         raise gr.Error(str(err))
     file_length = info["duration_string"]
     file_h_m_s = file_length.split(":")
     file_h_m_s = [int(sub_length) for sub_length in file_h_m_s]
     if len(file_h_m_s) == 1:
         file_h_m_s.insert(0, 0)
     if len(file_h_m_s) == 2:
         file_h_m_s.insert(0, 0)
     file_length_s = file_h_m_s[0] * 3600 + file_h_m_s[1] * 60 + file_h_m_s[2]
     if file_length_s > YT_LENGTH_LIMIT_S:
         yt_length_limit_hms = time.strftime("%HH:%MM:%SS", time.gmtime(YT_LENGTH_LIMIT_S))
         file_length_hms = time.strftime("%HH:%MM:%SS", time.gmtime(file_length_s))
-        raise gr.Error(f"最大 YouTube 長度為 {yt_length_limit_hms}，但獲得了長度為 {file_length_hms} 的影片。")
     ydl_opts = {"outtmpl": filename, "format": "worstvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best"}
     with youtube_dl.YoutubeDL(ydl_opts) as ydl:
         try:
             ydl.download([yt_url])
@@ -95,7 +73,7 @@ def download_yt_audio(yt_url, filename):
             raise gr.Error(str(err))
 @spaces.GPU
-def yt_transcribe(yt_url, task, translate_option, target_language, max_filesize=75.0):
     html_embed_str = _return_yt_html_embed(yt_url)
     with tempfile.TemporaryDirectory() as tmpdirname:
@@ -107,70 +85,50 @@ def yt_transcribe(yt_url, task, translate_option, target_language, max_filesize=
     inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
     inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
-    result = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)
-    text = result["text"]
-    if translate_option == "是":
-        text = translate_text(text, target_language)
     return html_embed_str, text
-demo = gr.Blocks(theme=gr.themes.Ocean())
-language_options = ["英文", "日文", "法文", "德文", "西班牙文", "繁體中文", "簡體中文", "越南文", "泰文"]
 mf_transcribe = gr.Interface(
     fn=transcribe,
-    inputs=[
-        gr.Audio(sources="microphone", type="filepath"),
-        gr.Radio(["transcribe", "translate"], label="任務", value="transcribe"),
-        gr.Radio(["是", "否"], label="是否翻譯轉錄結果", value="否"),
-        gr.Dropdown(language_options, label="目標語言", value="英文")
-    ],
     outputs="text",
-    title="清華大學多模態課程＆廖老師嫡傳弟子-第二組 「語音轉文字」model",
     description=(
-        "只需點擊一下按鈕，即可轉錄長篇的麥克風或音訊輸入！演示使用了"
-        f"檢查點 [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) 和 🤗 Transformers 來轉錄任意長度的音訊文件。"
     ),
     allow_flagging="never",
 )
 file_transcribe = gr.Interface(
     fn=transcribe,
-    inputs=[
-        gr.Audio(sources="upload", type="filepath", label="音訊檔案"),
-        gr.Radio(["transcribe", "translate"], label="任務", value="transcribe"),
-        gr.Radio(["是", "否"], label="是否翻譯轉錄結果", value="否"),
-        gr.Dropdown(language_options, label="目標語言", value="英文")
-    ],
     outputs="text",
-    title="清華大學多模態課程＆廖老師嫡傳弟子-第二組 「語音轉文字」model：上傳音檔",
     description=(
-        "只需點擊一下按鈕，即可轉錄長篇的麥克風或音訊輸入！演示使用了"
-        f"檢查點 [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) 和 🤗 Transformers 來轉錄任意長度的音訊文件。"
     ),
     allow_flagging="never",
 )
-yt_transcribe = gr.Interface(
     fn=yt_transcribe,
-    inputs=[
-        gr.Textbox(lines=1, placeholder="在此處貼上 YouTube 視頻的 URL", label="YouTube URL"),
-        gr.Radio(["transcribe", "translate"], label="任務", value="transcribe"),
-        gr.Radio(["是", "否"], label="是否翻譯轉錄結果", value="否"),
-        gr.Dropdown(language_options, label="目標語言", value="英文")
-    ],
     outputs=["html", "text"],
-    title="清華大學多模態課程＆廖老師嫡傳弟子-第二組 「語音轉文字」model: 轉錄 YouTube",
     description=(
-        "只需點擊一下按鈕，即可轉錄長篇的 YouTube 視頻！演示使用了"
-        f"檢查點 [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) 和 🤗 Transformers 來轉錄任意長度的視頻文件。"
     ),
     allow_flagging="never",
 )
 with demo:
-    gr.TabbedInterface([mf_transcribe, file_transcribe, yt_transcribe], ["麥克風", "音訊檔案", "YouTube"])
 demo.queue().launch(ssr_mode=False)

 import tempfile
 import os
 MODEL_NAME = "openai/whisper-large-v3-turbo"
 BATCH_SIZE = 8
 FILE_LIMIT_MB = 1000
+YT_LENGTH_LIMIT_S = 3600  # limit to 1 hour YouTube files
 device = 0 if torch.cuda.is_available() else "cpu"
     device=device,
 )
 @spaces.GPU
+def transcribe(inputs):
     if inputs is None:
+        raise gr.Error("未提供音訊檔案！請在提交請求前上傳或錄製一個音訊檔案。")
+    text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": "transcribe"}, return_timestamps=True)["text"]
     return text
 def _return_yt_html_embed(yt_url):
     video_id = yt_url.split("?v=")[-1]
     HTML_str = (
 def download_yt_audio(yt_url, filename):
     info_loader = youtube_dl.YoutubeDL()
     try:
         info = info_loader.extract_info(yt_url, download=False)
     except youtube_dl.utils.DownloadError as err:
         raise gr.Error(str(err))
     file_length = info["duration_string"]
     file_h_m_s = file_length.split(":")
     file_h_m_s = [int(sub_length) for sub_length in file_h_m_s]
     if len(file_h_m_s) == 1:
         file_h_m_s.insert(0, 0)
     if len(file_h_m_s) == 2:
         file_h_m_s.insert(0, 0)
     file_length_s = file_h_m_s[0] * 3600 + file_h_m_s[1] * 60 + file_h_m_s[2]
     if file_length_s > YT_LENGTH_LIMIT_S:
         yt_length_limit_hms = time.strftime("%HH:%MM:%SS", time.gmtime(YT_LENGTH_LIMIT_S))
         file_length_hms = time.strftime("%HH:%MM:%SS", time.gmtime(file_length_s))
+        raise gr.Error(f"最大YouTube影片長度為 {yt_length_limit_hms}，但提供的影片長度為 {file_length_hms}。")
     ydl_opts = {"outtmpl": filename, "format": "worstvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best"}
     with youtube_dl.YoutubeDL(ydl_opts) as ydl:
         try:
             ydl.download([yt_url])
             raise gr.Error(str(err))
 @spaces.GPU
+def yt_transcribe(yt_url, max_filesize=75.0):
     html_embed_str = _return_yt_html_embed(yt_url)
     with tempfile.TemporaryDirectory() as tmpdirname:
     inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
     inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
+    text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": "transcribe"}, return_timestamps=True)["text"]
     return html_embed_str, text
+demo = gr.Blocks(theme=gr.themes.Ocean())
 mf_transcribe = gr.Interface(
     fn=transcribe,
+    inputs=gr.Audio(sources="microphone", type="filepath"),
     outputs="text",
+    title="清華大學多模態課程＆廖老師嫡傳弟子-第二組 「語音轉文字」模型",
     description=(
+        "只需點擊一下按鈕，即可轉錄長篇的麥克風或音訊輸入！此示範使用"
+        f"檢查點 [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) 和 🤗 Transformers 來轉錄任意長度的音訊檔案。"
     ),
     allow_flagging="never",
 )
 file_transcribe = gr.Interface(
     fn=transcribe,
+    inputs=gr.Audio(sources="upload", type="filepath", label="音訊檔案"),
     outputs="text",
+    title="Whisper Large V3: 音訊轉錄",
     description=(
+        "只需點擊一下按鈕，即可轉錄長篇的麥克風或音訊輸入！此示範使用"
+        f"檢查點 [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) 和 🤗 Transformers 來轉錄任意長度的音訊檔案。"
     ),
     allow_flagging="never",
 )
+yt_transcribe_interface = gr.Interface(
     fn=yt_transcribe,
+    inputs=gr.Textbox(lines=1, placeholder="在此貼上YouTube影片的URL", label="YouTube URL"),
     outputs=["html", "text"],
+    title="Whisper Large V3: YouTube轉錄",
     description=(
+        "只需點擊一下按鈕，即可轉錄長篇的YouTube影片！此示範使用"
+        f"檢查點 [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) 和 🤗 Transformers 來轉錄任意長度的影片檔案。"
     ),
     allow_flagging="never",
 )
 with demo:
+    gr.TabbedInterface([mf_transcribe, file_transcribe, yt_transcribe_interface], ["麥克風", "音訊檔案", "YouTube"])
 demo.queue().launch(ssr_mode=False)