import os
import gradio as gr
import openai as o
import base64
import fitz # PyMuPDF
import cv2
from moviepy.video.io.VideoFileClip import VideoFileClip
import json
import requests
import re
from io import BytesIO
from PIL import Image
from pathlib import Path
# 📜 CONFIG
UI_TITLE = "✨🧙‍♂️🔮 GPT-4o Omni-Oracle"
KEY_FILE = "key.txt"
STATE_FILE = "app_state.json"
MODELS = {
    "GPT-4o ✨": "gpt-4o",
    "o3 (Advanced Reasoning) 🧠": "gpt-4-turbo",  # Placeholder
    "o4-mini (Fastest) ⚡": "gpt-4-turbo",  # Placeholder
    "o4-mini-high (Vision) 👁️‍🗨️": "gpt-4o",  # Placeholder
    "GPT-4.5 (Research) 🔬": "gpt-4-turbo-preview",  # Placeholder
    "GPT-4.1 (Analysis) 💻": "gpt-4-turbo",  # Placeholder
    "GPT-4.1-mini (Everyday) ☕": "gpt-4-turbo",  # Placeholder
    "GPT-4 Turbo 🚀": "gpt-4-turbo",
    "GPT-3.5 Turbo ⚡": "gpt-3.5-turbo",
}
VOICES = ["alloy", "ash", "ballad", "coral", "echo", "fable", "nova", "onyx", "sage", "shimmer"]
TTS_MODELS = ["gpt-4o-mini-tts", "tts-1", "tts-1-hd"]
FORMATS = ["mp3", "opus", "aac", "flac", "wav", "pcm"]
LANGUAGES = {
    "🇬🇧 English": "English", "🇨🇳 Chinese": "Chinese", "🇫🇷 French": "French", "🇩🇪 German": "German",
    "🇮🇱 Hebrew": "Hebrew", "🇮🇳 Hindi": "Hindi", "🇯🇵 Japanese": "Japanese", "🇳🇿 Maori": "Maori",
    "🇷🇺 Russian": "Russian", "🇪🇸 Spanish": "Spanish"
}
# 🎨 STYLE
H1 = "# <font size='7'>{0}</font>"
H2 = "## <font size='6'>{0}</font>"
# 🪄 HELPERS, LORE & AUTOSAVE RITUALS
def save_state(data: dict):
    """A rune that inscribes the session's memory onto a JSON scroll."""
    with open(STATE_FILE, 'w') as f:
        json.dump(data, f, indent=4)
def load_state() -> dict:
    """A ritual to recall the session's memory from the JSON scroll."""
    if os.path.exists(STATE_FILE):
        with open(STATE_FILE, 'r') as f:
            try:
                return json.load(f)
            except json.JSONDecodeError:
                return {}
    return {}
def update_and_save(key: str, value, state: dict):
    """A binding spell that updates a memory and immediately inscribes it."""
    state[key] = value
    save_state(state)
    return state
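# A minimal usage sketch of the autosave helpers (hypothetical values; not executed at import time):
#   state = load_state()                                   # {} on first run
#   state = update_and_save("model", "GPT-4o ✨", state)   # also rewrites app_state.json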
def save_key(k: str) -> str:
    "💾🔑 A rune to bind the Eldritch Key."
    if not k or not k.strip(): return "🚫 Empty Key"
    with open(KEY_FILE, "w") as f: f.write(k.strip())
    return "🔑✅ Key Saved!"
def get_key(k: str) -> str:
    "📜🔑 A ritual to summon the Eldritch Key."
    k = k.strip() if k and k.strip() else (open(KEY_FILE).read().strip() if os.path.exists(KEY_FILE) else os.getenv("OPENAI_KEY", ""))
    if not k: raise gr.Error("❗🔑 An Eldritch Key (OpenAI API Key) is required.")
    o.api_key = k
    return k
def file_to_base64(file_path):
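    "🖼️🔐 Reads a file from disk and returns its bytes as a base64 string."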
    with open(file_path, "rb") as f:
        return base64.b64encode(f.read()).decode('utf-8')
def invoke_oracle(scribe_key: str, model_name: str, system_prompt: str, user_content: list, history: list):
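    "🌀 Streams a prophecy (chat completion) from the chosen Oracle, growing the chat history chunk by chunk."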
    get_key(scribe_key)
    model_id = MODELS.get(model_name, model_name)  # Map the dropdown's display name to its API model id.
    # The system prompt must lead the conversation, followed by prior history and the new user turn.
    messages = [{"role": "system", "content": system_prompt}] + history + [{"role": "user", "content": user_content}]
    try:
        prophecy = o.chat.completions.create(model=model_id, messages=messages, stream=True)
        # user_content can be multimodal (text plus images), so a simple placeholder stands in for it in the chat log.
        history.append({"role": "user", "content": "..."})
        history.append({"role": "assistant", "content": ""})
        for chunk in prophecy:
            if chunk.choices[0].delta.content:
                history[-1]['content'] += chunk.choices[0].delta.content
                yield history
    except Exception as e:
        yield history + [{"role": "assistant", "content": f"🧙‍♂️🔮 A magical disturbance occurred: {str(e)}"}]
# --- Modality-Specific Summoning Rituals ---
def summon_vision_from_image(api_key, model, prompt, image_path, history):
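    "👁️ Sends an image plus a text prompt to a vision-capable Oracle."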
    if image_path is None: raise gr.Error("An image must be provided.")
    b64_image = file_to_base64(image_path)
    user_content = [{"type": "text", "text": prompt}, {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{b64_image}"}}]
    yield from invoke_oracle(api_key, model, "You are an assistant that analyzes images. Respond in Markdown.", user_content, history)
def summon_echo_from_audio(api_key, model, prompt, audio_path, history):
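    "🗣️ Transcribes the audio with Whisper, then asks the Oracle about the transcript."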
    if audio_path is None: raise gr.Error("An audio file must be provided.")
    get_key(api_key)
    with open(audio_path, "rb") as audio_file:
        transcription = o.audio.transcriptions.create(model="whisper-1", file=audio_file)
    full_prompt = f"{prompt}\n\n--- Transcription ---\n{transcription.text}"
    yield from invoke_oracle(api_key, model, "You analyze audio transcripts. Respond in Markdown.", [{"type": "text", "text": full_prompt}], history)
def summon_wisdom_from_text(api_key, model, prompt, file_path, history):
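    "📖 Extracts text from a PDF (via PyMuPDF) or plain-text file and asks the Oracle about it."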
    if file_path is None: raise gr.Error("A file must be provided.")
    text_content = ""
    if file_path.lower().endswith('.pdf'):
        with fitz.open(file_path) as doc:
            text_content = "".join(page.get_text() for page in doc)
    else:
        with open(file_path, 'r', encoding='utf-8') as f:
            text_content = f.read()
    full_prompt = f"{prompt}\n\n--- Document Content ---\n{text_content[:10000]}..."
    yield from invoke_oracle(api_key, model, "You analyze documents. Respond in Markdown.", [{"type": "text", "text": full_prompt}], history)
def summon_chronicle_from_video(api_key, model, prompt, video_path, history, progress=gr.Progress()):
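    "🎬 Extracts and transcribes the video's audio, samples frames, then asks the Oracle about both."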
    if video_path is None: raise gr.Error("A video must be provided.")
    get_key(api_key)
    base_video_path, _ = os.path.splitext(video_path)
    progress(0.1, desc="🔮 Extracting Audio...")
    audio_path = f"{base_video_path}.mp3"
    transcript_text = "No audio found."
    try:
        with VideoFileClip(video_path) as clip:
            clip.audio.write_audiofile(audio_path, bitrate="32k", logger=None)
        progress(0.3, desc="🎤 Transcribing Audio...")
        with open(audio_path, "rb") as audio_file:
            transcript_text = o.audio.transcriptions.create(model="whisper-1", file=audio_file).text
    except Exception as e:
        print(f"Audio failed: {e}")
    progress(0.6, desc="🖼️ Sampling Frames...")
    # Sample one frame roughly every two seconds, capped at ten frames to keep the prompt small.
    base64Frames = []
    video = cv2.VideoCapture(video_path)
    total_frames, fps = int(video.get(cv2.CAP_PROP_FRAME_COUNT)), video.get(cv2.CAP_PROP_FPS)
    frames_to_skip = max(int(fps * 2), 1)  # Guard against a zero step when FPS metadata is missing.
    for curr_frame in range(0, total_frames - 1, frames_to_skip):
        if len(base64Frames) >= 10: break
        video.set(cv2.CAP_PROP_POS_FRAMES, curr_frame)
        success, frame = video.read()
        if not success: break
        _, buffer = cv2.imencode(".jpg", frame)
        base64Frames.append(base64.b64encode(buffer).decode("utf-8"))
    video.release()
    progress(0.8, desc="🌀 Consulting Oracle...")
    user_content = [{"type": "text", "text": f"{prompt}\n\n--- Audio Transcript ---\n{transcript_text}"}, *map(lambda x: {"type": "image_url", "image_url": {"url": f'data:image/jpeg;base64,{x}', "detail": "low"}}, base64Frames)]
    yield from invoke_oracle(api_key, model, "You are a video analyst. Respond in Markdown.", user_content, history)
def generate_speech(api_key, tts_model, voice, text, language, audio_format, progress=gr.Progress()):
    """A ritual to give voice to the written word, in any tongue."""
    get_key(api_key)
    language = LANGUAGES.get(language, language)  # Map a flag-prefixed label like "🇫🇷 French" to plain "French".
    # Step 1: Translate the text if the language is not English
    translated_text = text
    if language != "English":
        progress(0.2, desc=f"Translating to {language}...")
        try:
            response = o.chat.completions.create(
                model="gpt-4o",
                messages=[
                    {"role": "system", "content": f"You are a translator. Translate the following text to {language}. Output only the translated text."},
                    {"role": "user", "content": text}
                ],
                temperature=0
            )
            translated_text = response.choices[0].message.content
        except Exception as e:
            raise gr.Error(f"Translation failed: {e}")
    # Step 2: Generate speech from the (possibly translated) text
    progress(0.6, desc="Summoning voice...")
    speech_file_path = Path(__file__).parent / f"speech.{audio_format}"
    try:
        response = o.audio.speech.create(
            model=tts_model,
            voice=voice,
            input=translated_text,
            response_format=audio_format
        )
        response.stream_to_file(speech_file_path)
    except Exception as e:
        raise gr.Error(f"Speech generation failed: {e}")
    progress(1.0, desc="Voice summoned!")
    return str(speech_file_path), translated_text
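# A minimal usage sketch (hypothetical values; assumes a valid key is saved or passed in):
#   audio_file, spoken_text = generate_speech("sk-...", "tts-1", "alloy", "Hello!", "🇫🇷 French", "mp3")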
# 🔮 UI
with gr.Blocks(title=UI_TITLE, theme=gr.themes.Soft(primary_hue="red", secondary_hue="orange")) as demo:
    initial_state = load_state()
    app_state = gr.State(initial_state)
    gr.Markdown(H1.format(UI_TITLE))
    with gr.Accordion("🔑 Eldritch Key & Oracle Selection", open=True):
        with gr.Row():
            api_key_box = gr.Textbox(label="🔑 Key", type="password", placeholder="sk-...", scale=3, value=initial_state.get('api_key', ''))
            save_btn = gr.Button("💾", scale=1)
            status_txt = gr.Textbox(interactive=False, scale=1, label="Status")
        model_selector = gr.Dropdown(choices=list(MODELS.keys()), label="🔮 Oracle", value=initial_state.get('model', "GPT-4o ✨"))
        save_btn.click(save_key, inputs=api_key_box, outputs=status_txt)
    chatbot = gr.Chatbot(height=500, label="📜 Scroll of Conversation", type='messages', value=initial_state.get('chatbot', []))
    with gr.Tabs():
        with gr.TabItem("💬 Chat"):
            text_prompt = gr.Textbox(label="Your Quest:", placeholder="Type your message...", value=initial_state.get('text_prompt', ''))
            text_event = text_prompt.submit(fn=lambda api_key, model, prompt, hist: invoke_oracle(api_key, model, "You are a helpful AI assistant.", [{"type": "text", "text": prompt}], hist), inputs=[api_key_box, model_selector, text_prompt, chatbot], outputs=chatbot)
        with gr.TabItem("🖼️ Image"):
            with gr.Row():
                image_input = gr.File(label="Upload Image", type="filepath")
                image_output = gr.Image(label="Your Image", type="filepath", interactive=False)
            image_prompt = gr.Textbox(label="Image Prompt:", value=initial_state.get('image_prompt', "What is in this image?"))
            image_btn = gr.Button("👁️ Summon Vision")
            image_input.change(lambda x: x, inputs=image_input, outputs=image_output)
            image_event = image_btn.click(summon_vision_from_image, [api_key_box, model_selector, image_prompt, image_input, chatbot], chatbot)
        with gr.TabItem("🎤 Audio"):
            audio_input = gr.File(label="Upload Audio", type="filepath")
            audio_prompt = gr.Textbox(label="Audio Prompt:", value=initial_state.get('audio_prompt', "Summarize this audio."))
            audio_btn = gr.Button("🗣️ Summon Echo")
            audio_event = audio_btn.click(summon_echo_from_audio, [api_key_box, model_selector, audio_prompt, audio_input, chatbot], chatbot)
        with gr.TabItem("🎥 Video"):
            video_input = gr.File(label="Upload Video", type="filepath")
            video_prompt = gr.Textbox(label="Video Prompt:", value=initial_state.get('video_prompt', "Summarize this video."))
            video_btn = gr.Button("🎬 Summon Chronicle")
            video_event = video_btn.click(summon_chronicle_from_video, [api_key_box, model_selector, video_prompt, video_input, chatbot], chatbot)
        with gr.TabItem("📄 Document"):
            doc_input = gr.File(label="Upload PDF or TXT", type="filepath")
            doc_prompt = gr.Textbox(label="Document Prompt:", value=initial_state.get('doc_prompt', "Summarize this document."))
            doc_btn = gr.Button("📖 Summon Wisdom")
            doc_event = doc_btn.click(summon_wisdom_from_text, [api_key_box, model_selector, doc_prompt, doc_input, chatbot], chatbot)
        with gr.TabItem("🔊 Speech Synthesis"):
            gr.Markdown(H2.format("Give Voice to Words"))
            tts_language = gr.Radio(choices=list(LANGUAGES.keys()), label="🈯 Language", value=initial_state.get('tts_language', "🇬🇧 English"))
            with gr.Row():
                tts_voice = gr.Dropdown(choices=VOICES, label="🗣️ Voice", value=initial_state.get('tts_voice', "alloy"))
                tts_model_select = gr.Dropdown(choices=TTS_MODELS, label="🧠 TTS Model", value=initial_state.get('tts_model', "gpt-4o-mini-tts"))
                tts_format = gr.Dropdown(choices=FORMATS, label="📦 Format", value=initial_state.get('tts_format', "mp3"))
            tts_text_input = gr.Textbox(label="📜 Text to Speak", lines=4, placeholder="Enter text here...", value=initial_state.get('tts_text', ''))
            tts_btn = gr.Button("🔊 Generate Speech")
            tts_translated_text = gr.Textbox(label="Translated Text (Output)", interactive=False)
            tts_audio_output = gr.Audio(label="🎧 Spoken Word", type="filepath")
            tts_event = tts_btn.click(generate_speech, [api_key_box, tts_model_select, tts_voice, tts_text_input, tts_language, tts_format], [tts_audio_output, tts_translated_text])
    # --- Autosave Event Listeners ---
    components_to_save = {
        'api_key': api_key_box, 'model': model_selector, 'text_prompt': text_prompt,
        'image_prompt': image_prompt, 'audio_prompt': audio_prompt, 'video_prompt': video_prompt,
        'doc_prompt': doc_prompt, 'tts_language': tts_language, 'tts_voice': tts_voice,
        'tts_model': tts_model_select, 'tts_format': tts_format, 'tts_text': tts_text_input
    }
    for key, component in components_to_save.items():
        # gr.State(key) feeds the dict key in as a constant input, so one handler serves every component.
        component.change(update_and_save, [gr.State(key), component, app_state], app_state)
    for event in [text_event, image_event, audio_event, video_event, doc_event]:
        event.then(lambda history, state: update_and_save('chatbot', history, state), [chatbot, app_state], app_state)
if __name__ == "__main__":
    demo.launch(share=True, debug=True)