import torch
import gradio as gr
import numpy as np
import requests
import os
import time

from transformers import pipeline
from transformers.pipelines.audio_utils import ffmpeg_read

# Transcription constants
MODEL_NAME = "openai/whisper-large-v3-turbo"
BATCH_SIZE = 8

device = 0 if torch.cuda.is_available() else "cpu"

pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device,
)

# OpenAI API key (the variable name must match the error message below)
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise ValueError("Please set the 'OPENAI_API_KEY' environment variable")
openai_api_key = api_key


# Transcribe audio with the Whisper pipeline; the result is later shared with the other tabs
def transcribe(inputs):
    if inputs is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")

    if isinstance(inputs, tuple):
        # gr.Audio(type="numpy") delivers (sampling_rate, array); integer PCM
        # must be converted to float32 in [-1, 1] before feeding the pipeline.
        sampling_rate, audio_data = inputs
        audio_data = audio_data.astype(np.float32)
        if audio_data.ndim > 1:
            audio_data = audio_data.mean(axis=1)  # downmix stereo to mono
        peak = np.max(np.abs(audio_data))
        if peak > 1.0:
            audio_data /= peak
        inputs = {"array": audio_data, "sampling_rate": sampling_rate}
    elif isinstance(inputs, str):
        # inputs is a filepath: read the raw bytes and decode them with ffmpeg
        with open(inputs, "rb") as f:
            inputs = f.read()
        inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
        inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
    else:
        raise gr.Error("Unrecognized audio format.")

    # Run the ASR pipeline (timestamps enable long-form transcription)
    text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": "transcribe"}, return_timestamps=True)["text"]
    return text


# Translation via the OpenAI chat completions API, with retry logic for rate limits (HTTP 429)
def translate_text(text, target_language):
    if not text:
        return "No transcription available. Please provide audio in the previous tabs first."

    prompt_instruction = f"Please translate the following text into {target_language}:\n'{text}'"
    url = "https://api.openai.com/v1/chat/completions"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {openai_api_key}",
    }
    data = {
        "model": "gpt-4o",
        "messages": [{"role": "user", "content": prompt_instruction}],
        "max_tokens": 500,
    }

    for attempt in range(5):  # retry up to 5 times
        try:
            response = requests.post(url, headers=headers, json=data, timeout=60)
            response.raise_for_status()  # raise on HTTP error statuses
            response_json = response.json()

            # Make sure the response actually contains a completion
            if "choices" in response_json and len(response_json["choices"]) > 0:
                return response_json["choices"][0]["message"]["content"]
            return "Translation failed: the API returned no result. Please try again."
        except requests.exceptions.HTTPError as http_err:
            if response.status_code == 429:
                time.sleep(5)  # rate limited: wait 5 seconds, then retry
                continue
            return f"HTTP error occurred: {http_err}"
        except Exception as err:
            return f"An error occurred: {err}"

    return "Retry limit exceeded. Please try again later."


# Gradio interface
demo = gr.Blocks(theme=gr.themes.Ocean())

# Module-level holder used to share the latest transcription across tabs
# (note: this value is shared by all sessions, not kept per user)
state = gr.State()


# Transcribe and stash the result so the translation tab can reuse it
def transcribe_and_save(inputs):
    text = transcribe(inputs)
    state.value = text  # store the text in state
    return text


mf_transcribe = gr.Interface(
    fn=transcribe_and_save,
    inputs=gr.Audio(sources="microphone", type="numpy"),  # microphone recordings arrive as numpy data
    outputs="text",
    title="清華多模態Team2 Speech-to-Text: Live Recording",
    description="Transcribe speech recorded from the microphone",
    flagging_mode="never",
)

file_transcribe = gr.Interface(
    fn=transcribe_and_save,
    inputs=gr.Audio(sources="upload", type="filepath", label="Audio file"),
    outputs="text",  # transcription only
    title="清華多模態Team2 Speech-to-Text: Audio File Upload",
    description="Transcribe an uploaded audio file",
    flagging_mode="never",
)


# Helper for the translation tab
def show_transcription_and_translate(language):
    if state.value:
        translated_text = translate_text(state.value, language)
        return f"Transcription:\n'{state.value}'\nTranslated into {language}:\n{translated_text}"
    return "No transcription available. Please provide audio in the previous tabs first."
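
# Optional smoke test for translate_text, handy when debugging the OpenAI call
# outside the UI. The guard variable name and the sample sentence are
# illustrative additions, not part of the original app.
if os.getenv("TRANSLATE_SMOKE_TEST"):
    print(translate_text("It is a beautiful day.", "Japanese"))
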
chat_interface = gr.Interface(
    fn=show_transcription_and_translate,
    inputs=gr.Dropdown(["English", "Chinese", "Japanese", "Spanish"], label="Target language"),
    outputs="text",
    title="Translation Assistant",
    description="Shows the transcription from the previous tabs and translates it into the selected language",
)

with demo:
    gr.TabbedInterface(
        [mf_transcribe, file_transcribe, chat_interface],
        ["Microphone", "Audio File", "Translation Assistant"],
    )

demo.queue().launch(ssr_mode=False)
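
# When running locally, launch() can also be asked for a public share link;
# share=True is a standard Gradio option, though not used in the original
# deployment:
#
#   demo.queue().launch(ssr_mode=False, share=True)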