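# Hugging Face Space file-view header (web page residue, not program code):
# author avatar "Marathon23's picture"; commit message "Update app.py";
# commit 3f9e557 (verified).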
import spaces
import torch
import gradio as gr
import yt_dlp as youtube_dl
from transformers import pipeline
from transformers.pipelines.audio_utils import ffmpeg_read
import tempfile
import os
import openai
import requests
import json
import time
# Speech-recognition configuration and the shared Whisper pipeline.
MODEL_NAME = "openai/whisper-large-v3-turbo"  # model id handed to pipeline()
BATCH_SIZE = 8  # batch_size used when the pipeline is called in transcribe()
FILE_LIMIT_MB = 1000  # not referenced in the visible part of this file
YT_LENGTH_LIMIT_S = 3600  # limit to 1 hour YouTube files

# Run on CUDA device 0 when a GPU is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = 0
else:
    device = "cpu"

# Built once at import time and reused by every transcription request.
pipe = pipeline(
    model=MODEL_NAME,
    task="automatic-speech-recognition",
    device=device,
    chunk_length_s=30,
)
# OpenAI API key.
# The error message below tells the user to set OPENAI_API_KEY, but the lookup
# used to read only the (apparently misspelled) name 'OenAI_API'. Read the
# documented name first and fall back to the old one so deployments that were
# configured with the old name keep working.
api_key = os.getenv('OPENAI_API_KEY') or os.getenv('OenAI_API')
if not api_key:
    # Fail fast at import time: translation cannot work without a key.
    raise ValueError("請設置 'OPENAI_API_KEY' 環境變數")
openai_api_key = api_key
def _to_mono_float32(samples):
    """Convert raw recorder samples to a mono float32 waveform for the ASR pipeline."""
    import numpy as np  # local import: numpy is not imported at the top of this file

    samples = np.asarray(samples)
    if np.issubdtype(samples.dtype, np.integer):
        # Integer PCM (e.g. int16) is rescaled to roughly [-1.0, 1.0].
        full_scale = float(np.iinfo(samples.dtype).max)
        samples = samples.astype(np.float32) / full_scale
    else:
        samples = samples.astype(np.float32)
    if samples.ndim > 1:
        # Multi-channel audio is averaged down to one channel; assumes the
        # Gradio layout (samples, channels) -- see the Gradio Audio docs.
        samples = samples.mean(axis=1)
    return samples


# Function to handle transcription and share text with other tabs
def transcribe(inputs):
    """Transcribe an audio clip to text with the module-level Whisper pipeline.

    Args:
        inputs: Either a ``(sampling_rate, samples)`` tuple, as delivered by
            ``gr.Audio(type="numpy")``, or a ``str`` path to an audio file, as
            delivered by ``gr.Audio(type="filepath")``.

    Returns:
        The ``"text"`` field of the pipeline result.

    Raises:
        gr.Error: If ``inputs`` is ``None`` or is neither a tuple nor a string.
    """
    if inputs is None:
        raise gr.Error("未提供音訊檔案!請在提交請求前上傳或錄製一個音訊檔案。")

    if isinstance(inputs, tuple):
        # Gradio's numpy audio value is (sampling_rate, samples). The previous
        # code unpacked it in the opposite order, which handed the pipeline an
        # int as the waveform and an array as the sampling rate.
        sampling_rate, audio_data = inputs
        inputs = {
            "array": _to_mono_float32(audio_data),
            "sampling_rate": sampling_rate,
        }
    elif isinstance(inputs, str):
        # inputs is a file path: read the raw bytes and let ffmpeg decode them
        # at the sampling rate the feature extractor expects.
        with open(inputs, "rb") as f:
            payload = f.read()
        target_rate = pipe.feature_extractor.sampling_rate
        inputs = {
            "array": ffmpeg_read(payload, target_rate),
            "sampling_rate": target_rate,
        }
    else:
        raise gr.Error("無法識別的音訊格式。")

    # Run the pipeline to transcribe the audio.
    result = pipe(
        inputs,
        batch_size=BATCH_SIZE,
        generate_kwargs={"task": "transcribe"},
        return_timestamps=True,
    )
    return result["text"]
# Enhanced translation function with retry logic
def translate_text(text, target_language):
    """Translate ``text`` into ``target_language`` via the OpenAI chat completions API.

    Args:
        text: Text to translate. A falsy value (``None`` or ``""``) returns a
            fixed "no transcription" message without calling the API.
        target_language: Language name interpolated into the prompt.

    Returns:
        The translated text on success; otherwise a human-readable error
        message string. Request failures are reported through the return
        value rather than raised.
    """
    if not text:
        return "無轉錄結果,請在前面步驟中提供音訊。"

    max_attempts = 5        # total tries when the API answers 429
    retry_delay_s = 5       # pause between rate-limited tries
    request_timeout_s = 60  # without a timeout a stalled connection hangs forever

    prompt_instruction = f"請將以下文字翻譯成{target_language}:\n'{text}'"
    url = "https://api.openai.com/v1/chat/completions"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {openai_api_key}",
    }
    data = {
        "model": "gpt-4o",
        "messages": [{"role": "user", "content": prompt_instruction}],
        # NOTE(review): this caps the reply length, so translations of long
        # transcripts may come back truncated -- confirm this is intended.
        "max_tokens": 500,
    }

    # Retry logic for handling rate limits (429 errors)
    for attempt in range(max_attempts):
        try:
            response = requests.post(
                url, headers=headers, json=data, timeout=request_timeout_s
            )
            response.raise_for_status()  # raises HTTPError on 4xx/5xx
            response_json = response.json()
            # Check that the API response actually contains a translation.
            if 'choices' in response_json and len(response_json['choices']) > 0:
                return response_json['choices'][0]['message']['content']
            return "翻譯出錯,無法獲取翻譯結果。請再試一次。"
        except requests.exceptions.HTTPError as http_err:
            # Take the status from the exception's own response. Compare with
            # "is not None": an error Response object is falsy.
            failed = getattr(http_err, "response", None)
            status_code = failed.status_code if failed is not None else None
            if status_code == 429:
                # Rate limited: wait and retry, but do not sleep after the
                # final attempt since nothing follows it.
                if attempt < max_attempts - 1:
                    time.sleep(retry_delay_s)
                continue
            return f"HTTP error occurred: {http_err}"
        except Exception as err:
            # Top-level boundary for the UI: report the failure as text.
            return f"發生錯誤:{err}"
    return "超過重試次數,請稍後再試。"
# Gradio Interface
# Top-level Blocks container; the tabbed UI is mounted into it further down.
demo = gr.Blocks(theme=gr.themes.Ocean())
# State object used to share data across tabs.
# NOTE(review): this gr.State is created outside any Blocks context and the
# handlers in this file read and assign state.value directly, so it presumably
# acts as a single process-wide value shared by all visitors rather than
# per-session state -- confirm against the Gradio State documentation.
state = gr.State()
# Microphone / file audio to text
def transcribe_and_save(inputs):
    """Transcribe ``inputs`` and remember the result for the translation tab.

    Delegates to ``transcribe`` and stores the resulting text on the
    module-level ``state`` object before returning it.

    Args:
        inputs: Audio value forwarded unchanged to ``transcribe``.

    Returns:
        The transcribed text.
    """
    # Keep the text on the shared state so other tabs can read it.
    transcript = state.value = transcribe(inputs)
    return transcript
# Microphone tab: record audio in the browser and transcribe it.
mf_transcribe = gr.Interface(
    fn=transcribe_and_save,
    # type="numpy" makes sure the recording is handed to fn as numpy data.
    inputs=gr.Audio(sources="microphone", type="numpy"),
    outputs="text",
    title="清華多模態Team2_語音轉文字_即時錄音",
    description="使用麥克風錄音轉錄為文字",
    # flagging_mode replaces allow_flagging, which is deprecated in the
    # Gradio 5 API this file already relies on (gr.themes.Ocean, ssr_mode).
    flagging_mode="never",
)
# Upload tab: transcribe an uploaded audio file.
file_transcribe = gr.Interface(
    fn=transcribe_and_save,
    # type="filepath" hands fn the path of the uploaded file.
    inputs=gr.Audio(sources="upload", type="filepath", label="音訊檔案"),
    outputs="text",  # only the transcription result is shown
    title="清華多模態Team2_語音轉文字_音訊檔案上傳轉錄文字",
    description="上傳音訊檔案轉錄為文字",
    # flagging_mode replaces allow_flagging, which is deprecated in the
    # Gradio 5 API this file already relies on (gr.themes.Ocean, ssr_mode).
    flagging_mode="never",
)
# Chat and translation helper
def show_transcription_and_translate(language):
    """Show the stored transcription together with its translation.

    Reads the text kept on the module-level ``state`` object and passes it to
    ``translate_text``.

    Args:
        language: Target language name chosen in the dropdown.

    Returns:
        A string containing the transcription and its translation, or a fixed
        message when no transcription has been stored yet.
    """
    transcript = state.value
    # Guard clause: nothing has been transcribed yet.
    if not transcript:
        return "無轉錄結果,請在前面步驟中提供音訊。"

    translated_text = translate_text(transcript, language)
    return f"轉錄結果:\n'{transcript}'\n翻譯為 {language}:\n{translated_text}"
# Translation tab: pick a target language and show transcription + translation.
target_language_dropdown = gr.Dropdown(
    ["English", "Chinese", "Japanese", "Spanish"],
    label="選擇目標語言",
)
chat_interface = gr.Interface(
    fn=show_transcription_and_translate,
    inputs=target_language_dropdown,
    outputs="text",
    title="翻譯小助手",
    description="呈現前面聲音轉錄文字的結果,並自動翻譯成所選語言",
)
# Mount the three interfaces as tabs inside the top-level Blocks app.
tab_interfaces = [mf_transcribe, file_transcribe, chat_interface]
tab_titles = ["麥克風轉錄", "音訊檔案轉錄", "翻譯小助手"]
with demo:
    gr.TabbedInterface(tab_interfaces, tab_titles)

# Enable request queuing, then start the server with server-side rendering off.
queued_app = demo.queue()
queued_app.launch(ssr_mode=False)