Spaces:

Marathon23
/

whisper_audio-text-translate_v2

Sleeping

App Files Files Community

whisper_audio-text-translate_v2 / app.py

Marathon23

Update app.py

3f9e557 verified about 1 month ago

raw

history blame contribute delete

5.21 kB

	import spaces
	import torch
	import gradio as gr
	import yt_dlp as youtube_dl
	from transformers import pipeline
	from transformers.pipelines.audio_utils import ffmpeg_read
	import tempfile
	import os
	import openai
	import requests
	import json
	import time

	# Speech-recognition configuration.
	MODEL_NAME = "openai/whisper-large-v3-turbo"  # checkpoint handed to the pipeline below
	BATCH_SIZE = 8  # forwarded as batch_size when the pipeline is called in transcribe()
	# The two limits below are not referenced anywhere in the visible code.
	FILE_LIMIT_MB = 1000
	YT_LENGTH_LIMIT_S = 60 * 60  # one hour, in seconds (limit for YouTube files)

	# Use CUDA device index 0 when a GPU is visible to torch, otherwise the CPU.
	if torch.cuda.is_available():
	    device = 0
	else:
	    device = "cpu"

	# Module-level ASR pipeline, built once at import time and reused by every request.
	pipe = pipeline(
	    task="automatic-speech-recognition",
	    model=MODEL_NAME,
	    device=device,
	    chunk_length_s=30,
	)

	# OpenAI API key.
	# The lookup used to read only the misspelled variable 'OenAI_API' while the
	# error message below tells the operator to set 'OPENAI_API_KEY'. Read the
	# documented name first and fall back to the legacy spelling so deployments
	# that already configured the old secret keep working.
	api_key = os.getenv('OPENAI_API_KEY') or os.getenv('OenAI_API')
	if not api_key:
	    # Fail fast at import time: translation cannot work without a key.
	    raise ValueError("請設置 'OPENAI_API_KEY' 環境變數")
	openai_api_key = api_key

	# Transcription entry point shared by the microphone and file-upload tabs.
	def _numpy_audio_to_asr_input(audio):
	    """Turn a two-element numpy-audio tuple into the dict passed to ``pipe``.

	    Gradio documents ``type="numpy"`` audio as ``(sample_rate, data)``. The
	    previous code unpacked the tuple as ``(data, sample_rate)``, which handed
	    the sample rate to the model as if it were the waveform. The scalar
	    element is detected here, so the legacy ordering is still accepted.

	    Returns:
	        ``{"array": float32 mono samples, "sampling_rate": int}``.
	    """
	    import numpy as np  # local import: numpy is not imported at file level

	    first, second = audio
	    if np.isscalar(first):
	        sampling_rate, samples = first, second
	    else:
	        samples, sampling_rate = first, second

	    samples = np.asarray(samples)
	    source_dtype = samples.dtype

	    # Downmix multi-channel audio to one channel.
	    # Assumes channels are on the last axis, i.e. (samples, channels) - TODO confirm.
	    if samples.ndim > 1:
	        samples = samples.mean(axis=-1)

	    samples = samples.astype(np.float32)
	    # Integer PCM is rescaled by the dtype's full range so the waveform lands
	    # in roughly [-1.0, 1.0]; floating-point input is left at its own scale.
	    if np.issubdtype(source_dtype, np.integer):
	        samples /= float(np.iinfo(source_dtype).max) + 1.0

	    return {"array": samples, "sampling_rate": int(sampling_rate)}


	def transcribe(inputs):
	    """Transcribe audio with the module-level ``pipe`` and return the text.

	    Args:
	        inputs: Either a two-element tuple holding a sample rate and a numpy
	            waveform (microphone tab), or a ``str`` path to an audio file
	            (upload tab).

	    Returns:
	        The ``"text"`` field of the pipeline result.

	    Raises:
	        gr.Error: If ``inputs`` is ``None`` or is neither a tuple nor a ``str``.
	    """
	    if inputs is None:
	        raise gr.Error("未提供音訊檔案！請在提交請求前上傳或錄製一個音訊檔案。")

	    if isinstance(inputs, tuple):
	        inputs = _numpy_audio_to_asr_input(inputs)
	    elif isinstance(inputs, str):
	        # A file path: read the raw bytes and let ffmpeg decode them at the
	        # sampling rate the model's feature extractor expects.
	        with open(inputs, "rb") as audio_file:
	            payload = audio_file.read()
	        target_rate = pipe.feature_extractor.sampling_rate
	        inputs = {
	            "array": ffmpeg_read(payload, target_rate),
	            "sampling_rate": target_rate,
	        }
	    else:
	        raise gr.Error("無法識別的音訊格式。")

	    # Run the ASR pipeline on the prepared input.
	    result = pipe(
	        inputs,
	        batch_size=BATCH_SIZE,
	        generate_kwargs={"task": "transcribe"},
	        return_timestamps=True,
	    )
	    return result["text"]

	# Translation through the OpenAI chat-completions endpoint, retrying on HTTP 429.
	def translate_text(text, target_language):
	    """Translate ``text`` into ``target_language`` via the OpenAI chat API.

	    Args:
	        text: Text to translate. ``None``, empty, or whitespace-only input is
	            answered locally without calling the API.
	        target_language: Name of the language, interpolated into the prompt.

	    Returns:
	        The translated text on success; otherwise a human-readable message
	        describing what went wrong. Request and parsing failures are reported
	        through the return value, not raised, so the UI always has something
	        to show.
	    """
	    max_attempts = 5  # total tries when the API keeps answering 429
	    retry_delay_s = 5  # pause between rate-limited tries
	    request_timeout_s = (10, 60)  # (connect, read); without it a stalled call never returns

	    if not text or not str(text).strip():
	        return "無轉錄結果，請在前面步驟中提供音訊。"

	    prompt_instruction = f"請將以下文字翻譯成{target_language}：\n'{text}'"
	    url = "https://api.openai.com/v1/chat/completions"
	    headers = {
	        "Content-Type": "application/json",
	        "Authorization": f"Bearer {openai_api_key}",
	    }
	    # NOTE(review): the reply is capped at 500 tokens, so a long transcript may
	    # come back truncated - confirm this limit is intended.
	    payload = {
	        "model": "gpt-4o",
	        "messages": [{"role": "user", "content": prompt_instruction}],
	        "max_tokens": 500,
	    }

	    for attempt in range(max_attempts):
	        try:
	            response = requests.post(
	                url, headers=headers, json=payload, timeout=request_timeout_s
	            )
	            response.raise_for_status()  # turn 4xx/5xx into HTTPError
	            response_json = response.json()

	            # Make sure the API reply actually carries a translation.
	            choices = response_json.get("choices")
	            if choices:
	                return choices[0]["message"]["content"]
	            return "翻譯出錯，無法獲取翻譯結果。請再試一次。"

	        except requests.exceptions.HTTPError as http_err:
	            # Read the status from the exception itself rather than from the
	            # outer ``response`` variable.
	            status_code = getattr(http_err.response, "status_code", None)
	            if status_code == 429:
	                # Rate limited: wait and retry, but do not sleep after the
	                # final attempt since nothing follows it.
	                if attempt < max_attempts - 1:
	                    time.sleep(retry_delay_s)
	                continue
	            return f"HTTP error occurred: {http_err}"
	        except Exception as err:
	            # Deliberate catch-all at the UI boundary: network errors, timeouts
	            # and malformed replies all become a message for the user.
	            return f"發生錯誤：{err}"

	    return "超過重試次數，請稍後再試。"

	# Gradio application shell, styled with the Ocean theme.
	_ocean_theme = gr.themes.Ocean()
	demo = gr.Blocks(theme=_ocean_theme)

	# Holder used to pass the latest transcription to the translation tab.
	# NOTE(review): this is a single module-level object whose ``.value`` is
	# overwritten by the handlers below, so every caller in this process reads and
	# writes the same text - confirm that sharing across users is intended.
	state = gr.State()

	# Handler used by both transcription tabs (microphone and file upload).
	def transcribe_and_save(inputs):
	    """Transcribe ``inputs`` and remember the text for the translation tab.

	    The transcription is written to the module-level ``state.value`` and also
	    returned so the calling tab can display it.

	    NOTE(review): ``state`` is one module-level object, so each call replaces
	    whatever an earlier call stored - confirm this is acceptable when several
	    users are connected.
	    """
	    transcript = transcribe(inputs)
	    state.value = transcript  # keep the text available to the translation tab
	    return transcript

	# Microphone tab: record audio in the browser and transcribe it.
	# ``flagging_mode`` replaces ``allow_flagging``, which Gradio 5 deprecates; the
	# file already relies on Gradio 5 features (``ssr_mode``, ``gr.themes.Ocean``).
	mf_transcribe = gr.Interface(
	    fn=transcribe_and_save,
	    inputs=gr.Audio(sources="microphone", type="numpy"),  # recording arrives as numpy data
	    outputs="text",
	    title="清華多模態Team2_語音轉文字_即時錄音",
	    description="使用麥克風錄音轉錄為文字",
	    flagging_mode="never",
	)

	# File-upload tab: transcribe an uploaded audio file.
	file_transcribe = gr.Interface(
	    fn=transcribe_and_save,
	    inputs=gr.Audio(sources="upload", type="filepath", label="音訊檔案"),
	    outputs="text",  # only the transcription is shown
	    title="清華多模態Team2_語音轉文字_音訊檔案上傳轉錄文字",
	    description="上傳音訊檔案轉錄為文字",
	    flagging_mode="never",
	)

	# Translation helper: shows the stored transcription together with its translation.
	def show_transcription_and_translate(language):
	    """Return the stored transcription plus its translation into ``language``.

	    Reads the text from the module-level ``state.value``. When nothing has
	    been stored yet, a hint asking the user to provide audio first is
	    returned and no translation is attempted.
	    """
	    saved_text = state.value
	    if not saved_text:
	        return "無轉錄結果，請在前面步驟中提供音訊。"

	    translated_text = translate_text(saved_text, language)
	    return f"轉錄結果：\n'{saved_text}'\n翻譯為 {language}：\n{translated_text}"

	# Translation tab: pick a target language and translate the stored transcription.
	chat_interface = gr.Interface(
	    title="翻譯小助手",
	    description="呈現前面聲音轉錄文字的結果，並自動翻譯成所選語言",
	    fn=show_transcription_and_translate,
	    inputs=gr.Dropdown(
	        choices=["English", "Chinese", "Japanese", "Spanish"],
	        label="選擇目標語言",
	    ),
	    outputs="text",
	)

	# Assemble the three interfaces as tabs inside the Blocks container.
	_tab_pages = [mf_transcribe, file_transcribe, chat_interface]
	_tab_titles = ["麥克風轉錄", "音訊檔案轉錄", "翻譯小助手"]
	with demo:
	    gr.TabbedInterface(_tab_pages, _tab_titles)

	# Enable request queuing, then start the server with ssr_mode turned off.
	_queued_app = demo.queue()
	_queued_app.launch(ssr_mode=False)