|
import spaces |
|
import torch |
|
import gradio as gr |
|
import yt_dlp as youtube_dl |
|
from transformers import pipeline |
|
from transformers.pipelines.audio_utils import ffmpeg_read |
|
import tempfile |
|
import os |
|
import openai |
|
import requests |
|
import json |
|
import time |
|
|
|
|
|
# Hugging Face checkpoint loaded into the speech-recognition pipeline below.
MODEL_NAME = "openai/whisper-large-v3-turbo"
# Batch size handed to the pipeline on every transcription call.
BATCH_SIZE = 8
# NOTE(review): the two limits below are not referenced anywhere in the
# visible part of this file; presumably an upload-size cap (MB) and a YouTube
# duration cap (seconds) -- confirm before relying on them.
FILE_LIMIT_MB = 1000
YT_LENGTH_LIMIT_S = 3600

# Use device index 0 when CUDA is available, otherwise run on the CPU.
if torch.cuda.is_available():
    device = 0
else:
    device = "cpu"

# Built once at import time and shared by every request.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    device=device,
    chunk_length_s=30,
)
|
|
|
|
|
# Load the OpenAI API key from the environment.
# The error message below tells users to set OPENAI_API_KEY, so that name is
# honoured first; the historical (misspelled) variable name "OenAI_API" is
# still accepted so existing deployments keep working.
api_key = os.getenv("OPENAI_API_KEY") or os.getenv("OenAI_API")
if not api_key:
    # Fail at start-up rather than on the first translation request.
    raise ValueError("請設置 'OPENAI_API_KEY' 環境變數")
openai_api_key = api_key
|
|
|
|
|
def transcribe(inputs):
    """Transcribe an audio clip to text with the module-level Whisper pipeline.

    Args:
        inputs: Either a ``(sample_rate, data)`` tuple as produced by
            ``gr.Audio(type="numpy")`` or a path to an audio file as produced
            by ``gr.Audio(type="filepath")``. A tuple in the reversed
            ``(data, sample_rate)`` order is tolerated as well.

    Returns:
        The transcribed text.

    Raises:
        gr.Error: If no audio was supplied or the input type is unsupported.
    """
    if inputs is None:
        raise gr.Error("未提供音訊檔案!請在提交請求前上傳或錄製一個音訊檔案。")

    if isinstance(inputs, tuple):
        first, second = inputs
        # Gradio hands over (sample_rate, data). Detect which element is the
        # waveform by looking for an array with at least one dimension, so the
        # reversed order keeps working too.
        if getattr(first, "ndim", 0) >= 1:
            audio_data, sampling_rate = first, second
        else:
            sampling_rate, audio_data = first, second

        # The pipeline expects single-channel audio; Gradio delivers stereo
        # recordings as (samples, channels).
        if audio_data.ndim > 1:
            audio_data = audio_data.mean(axis=1)

        # Gradio yields integer PCM; convert to float32 and peak-normalise,
        # guarding against an all-zero (silent) or empty clip.
        audio_data = audio_data.astype("float32")
        peak = float(abs(audio_data).max()) if audio_data.size else 0.0
        if peak > 0:
            audio_data = audio_data / peak

        inputs = {"array": audio_data, "sampling_rate": int(sampling_rate)}
    elif isinstance(inputs, str):
        # A file path: read the raw bytes and let ffmpeg decode them at the
        # sampling rate the feature extractor expects.
        with open(inputs, "rb") as f:
            raw_bytes = f.read()

        target_rate = pipe.feature_extractor.sampling_rate
        decoded = ffmpeg_read(raw_bytes, target_rate)
        inputs = {"array": decoded, "sampling_rate": target_rate}
    else:
        raise gr.Error("無法識別的音訊格式。")

    result = pipe(
        inputs,
        batch_size=BATCH_SIZE,
        generate_kwargs={"task": "transcribe"},
        return_timestamps=True,
    )
    return result["text"]
|
|
|
|
|
def translate_text(text, target_language):
    """Translate ``text`` into ``target_language`` via the OpenAI chat API.

    Failures are reported as human-readable strings instead of exceptions so
    the result can be shown directly in the UI.

    Args:
        text: The text to translate. Empty or ``None`` short-circuits.
        target_language: Name of the language to translate into; it is
            interpolated into the prompt as-is.

    Returns:
        The translated text, or a message describing why no translation
        could be produced.
    """
    if not text:
        return "無轉錄結果,請在前面步驟中提供音訊。"

    prompt_instruction = f"請將以下文字翻譯成{target_language}:\n'{text}'"
    url = "https://api.openai.com/v1/chat/completions"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {openai_api_key}",
    }
    payload = {
        "model": "gpt-4o",
        "messages": [{"role": "user", "content": prompt_instruction}],
        "max_tokens": 500,
    }

    max_attempts = 5
    retry_delay_s = 5
    # Without a timeout a stalled connection would block this worker forever.
    request_timeout_s = 60

    for attempt in range(1, max_attempts + 1):
        try:
            response = requests.post(
                url, headers=headers, json=payload, timeout=request_timeout_s
            )
            response.raise_for_status()
            response_json = response.json()

            choices = (
                response_json.get("choices")
                if isinstance(response_json, dict)
                else None
            )
            if choices:
                return choices[0]["message"]["content"]
            return "翻譯出錯,無法獲取翻譯結果。請再試一次。"
        except requests.exceptions.HTTPError as http_err:
            # Read the status from the exception itself rather than from a
            # variable in the enclosing scope.
            status = getattr(http_err.response, "status_code", None)
            if status == 429:
                # Rate limited: back off and retry, but do not sleep after
                # the final attempt since nothing follows it.
                if attempt < max_attempts:
                    time.sleep(retry_delay_s)
                continue
            return f"HTTP error occurred: {http_err}"
        except Exception as err:
            # Boundary handler: timeouts, connection errors and malformed
            # responses are all surfaced to the user as text.
            return f"發生錯誤:{err}"

    return "超過重試次數,請稍後再試。"
|
|
|
|
|
# Top-level Blocks container for the whole app, using the Ocean theme.
ocean_theme = gr.themes.Ocean()
demo = gr.Blocks(theme=ocean_theme)

# Holder for the most recent transcript, read later by the translation tab.
# NOTE(review): this is a single module-level object whose ``.value`` is
# overwritten by every transcription, so concurrent users would see each
# other's text -- consider wiring per-session state through event
# inputs/outputs instead.
state = gr.State()
|
|
|
|
|
def transcribe_and_save(inputs):
    """Transcribe ``inputs`` and remember the result for the translation tab.

    The transcript is stored on the module-level ``state`` object and also
    returned to the caller.
    """
    transcript = transcribe(inputs)
    # Stash the text so show_transcription_and_translate can pick it up.
    state.value = transcript
    return transcript
|
|
|
# Tab 1: record from the microphone and transcribe the recording.
# ``flagging_mode`` replaces the deprecated ``allow_flagging`` argument in
# Gradio 5, which this file already requires.
mf_transcribe = gr.Interface(
    fn=transcribe_and_save,
    inputs=gr.Audio(sources="microphone", type="numpy"),
    outputs="text",
    title="清華多模態Team2_語音轉文字_即時錄音",
    description="使用麥克風錄音轉錄為文字",
    flagging_mode="never",
)
|
|
|
|
|
# Tab 2: upload an audio file and transcribe it.
# ``flagging_mode`` replaces the deprecated ``allow_flagging`` argument in
# Gradio 5, which this file already requires.
file_transcribe = gr.Interface(
    fn=transcribe_and_save,
    inputs=gr.Audio(sources="upload", type="filepath", label="音訊檔案"),
    outputs="text",
    title="清華多模態Team2_語音轉文字_音訊檔案上傳轉錄文字",
    description="上傳音訊檔案轉錄為文字",
    flagging_mode="never",
)
|
|
|
|
|
def show_transcription_and_translate(language):
    """Return the stored transcript together with its translation.

    Args:
        language: Target language name chosen in the dropdown.

    Returns:
        A text block containing the transcript and its translation, or a
        hint message when no transcript has been stored yet.
    """
    transcript = state.value
    # Guard clause: nothing has been transcribed so far.
    if not transcript:
        return "無轉錄結果,請在前面步驟中提供音訊。"

    translated_text = translate_text(transcript, language)
    return f"轉錄結果:\n'{transcript}'\n翻譯為 {language}:\n{translated_text}"
|
|
|
# Tab 3: pick a target language and show the stored transcript translated.
target_language_choices = ["English", "Chinese", "Japanese", "Spanish"]
chat_interface = gr.Interface(
    title="翻譯小助手",
    description="呈現前面聲音轉錄文字的結果,並自動翻譯成所選語言",
    fn=show_transcription_and_translate,
    inputs=gr.Dropdown(target_language_choices, label="選擇目標語言"),
    outputs="text",
)
|
|
|
# Assemble the three interfaces into tabs inside the Blocks container.
with demo:
    tab_pages = [mf_transcribe, file_transcribe, chat_interface]
    tab_titles = ["麥克風轉錄", "音訊檔案轉錄", "翻譯小助手"]
    gr.TabbedInterface(tab_pages, tab_titles)

# Enable the request queue and start the server with SSR turned off.
demo.queue().launch(ssr_mode=False)