import os
import gradio as gr
import openai as o
import base64
import fitz  # PyMuPDF
import cv2
from moviepy.video.io.VideoFileClip import VideoFileClip
import json
import requests
import re
from io import BytesIO
from PIL import Image
from pathlib import Path
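# Assumed third-party dependencies (no requirements list is included here):
#   pip install gradio openai pymupdf opencv-python moviepy requests pillow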
# 📜 CONFIG
UI_TITLE = "✨🧙‍♂️🔮 GPT-4o Omni-Oracle"
KEY_FILE = "key.txt"
STATE_FILE = "app_state.json"
MODELS = {
    "GPT-4o ✨": "gpt-4o",
"o3 (Advanced Reasoning) �": "gpt-4-turbo", # Placeholder | |
"o4-mini (Fastest) ⚡": "gpt-4-turbo", # Placeholder | |
"o4-mini-high (Vision) 👁️🗨️": "gpt-4o", # Placeholder | |
"GPT-4.5 (Research) 🔬": "gpt-4-turbo-preview", # Placeholder | |
"GPT-4.1 (Analysis) 💻": "gpt-4-turbo", # Placeholder | |
"GPT-4.1-mini (Everyday) ☕": "gpt-4-turbo", # Placeholder | |
"GPT-4 Turbo 🚀": "gpt-4-turbo", | |
"GPT-3.5 Turbo ⚡": "gpt-3.5-turbo", | |
} | |
VOICES = ["alloy", "ash", "ballad", "coral", "echo", "fable", "nova", "onyx", "sage", "shimmer"] | |
TTS_MODELS = ["gpt-4o-mini-tts", "tts-1", "tts-1-hd"] | |
FORMATS = ["mp3", "opus", "aac", "flac", "wav", "pcm"] | |
LANGUAGES = { | |
"🇬🇧 English": "English", "🇨🇳 Chinese": "Chinese", "🇫🇷 French": "French", "🇩🇪 German": "German", | |
"🇮🇱 Hebrew": "Hebrew", "🇮🇳 Hindi": "Hindi", "🇯🇵 Japanese": "Japanese", "🇳🇿 Maori": "Maori", | |
"🇷🇺 Russian": "Russian", "🇪🇸 Spanish": "Spanish" | |
} | |
# 🎨 STYLE | |
H1 = "# <font size='7'>{0}</font>" | |
H2 = "## <font size='6'>{0}</font>" | |
# 🪄 HELPERS, LORE & AUTOSAVE RITUALS
def save_state(data: dict):
    """A rune that inscribes the session's memory onto a JSON scroll."""
    with open(STATE_FILE, 'w') as f:
        json.dump(data, f, indent=4)

def load_state() -> dict:
    """A ritual to recall the session's memory from the JSON scroll."""
    if os.path.exists(STATE_FILE):
        with open(STATE_FILE, 'r') as f:
            try:
                return json.load(f)
            except json.JSONDecodeError:
                return {}
    return {}

def update_and_save(key: str, value, state: dict):
    """A binding spell that updates a memory and immediately inscribes it."""
    state[key] = value
    save_state(state)
    return state
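# For reference, app_state.json ends up shaped roughly like this (illustrative,
# keys come from components_to_save below plus 'chatbot'):
#   {"api_key": "sk-...", "model": "GPT-4o ✨", "text_prompt": "...", "chatbot": [...]}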
def save_key(k: str) -> str:
    "💾🔑 A rune to bind the Eldritch Key."
    if not k or not k.strip():
        return "🚫 Empty Key"
    with open(KEY_FILE, "w") as f:
        f.write(k.strip())
    return "🔑✅ Key Saved!"
def get_key(k: str) -> str:
    "📜🔑 A ritual to summon the Eldritch Key."
    # Precedence: textbox value, then key file, then environment variable.
    k = k.strip() if k else ""
    if not k and os.path.exists(KEY_FILE):
        with open(KEY_FILE) as f:
            k = f.read().strip()
    if not k:
        k = os.getenv("OPENAI_KEY", "")
    if not k:
        raise gr.Error("❗🔑 An Eldritch Key (OpenAI API Key) is required.")
    o.api_key = k
    return k
def file_to_base64(file_path):
    with open(file_path, "rb") as f:
        return base64.b64encode(f.read()).decode('utf-8')
def invoke_oracle(scribe_key: str, model_name: str, system_prompt: str, user_content: list, history: list):
    get_key(scribe_key)
    # The UI hands over the display label (e.g. "GPT-4o ✨"); map it to the API model id.
    model_id = MODELS.get(model_name, model_name)
    # The system prompt must lead the message list; the original appended it after the history.
    messages = [{"role": "system", "content": system_prompt}] + history + [{"role": "user", "content": user_content}]
    try:
        prophecy = o.chat.completions.create(model=model_id, messages=messages, stream=True)
        history.append({"role": "user", "content": "..."})  # display placeholder for the (possibly multimodal) prompt
        history.append({"role": "assistant", "content": ""})
        for chunk in prophecy:
            if chunk.choices[0].delta.content:
                history[-1]['content'] += chunk.choices[0].delta.content
                yield history
    except Exception as e:
        yield history + [{"role": "assistant", "content": f"🧙‍♂️🔮 A magical disturbance occurred: {str(e)}"}]
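# Note: invoke_oracle is a generator; Gradio streams each yielded `history`
# back into the Chatbot. A minimal standalone use (hypothetical values):
#   for hist in invoke_oracle("sk-...", "GPT-4o ✨", "You are helpful.",
#                             [{"type": "text", "text": "Hello"}], []):
#       print(hist[-1]["content"])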
# --- Modality-Specific Summoning Rituals ---
def summon_vision_from_image(api_key, model, prompt, image_path, history):
    if image_path is None:
        raise gr.Error("An image must be provided.")
    # gr.File(type="filepath") hands over a plain path string, so no .name attribute.
    b64_image = file_to_base64(image_path)
    user_content = [{"type": "text", "text": prompt}, {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{b64_image}"}}]
    yield from invoke_oracle(api_key, model, "You are an assistant that analyzes images. Respond in Markdown.", user_content, history)
def summon_echo_from_audio(api_key, model, prompt, audio_path, history):
    if audio_path is None:
        raise gr.Error("An audio file must be provided.")
    get_key(api_key)
    with open(audio_path, "rb") as audio_file:
        transcription = o.audio.transcriptions.create(model="whisper-1", file=audio_file)
    full_prompt = f"{prompt}\n\n--- Transcription ---\n{transcription.text}"
    yield from invoke_oracle(api_key, model, "You analyze audio transcripts. Respond in Markdown.", [{"type": "text", "text": full_prompt}], history)
def summon_wisdom_from_text(api_key, model, prompt, file_path, history):
    if file_path is None:
        raise gr.Error("A file must be provided.")
    text_content = ""
    if file_path.lower().endswith('.pdf'):
        with fitz.open(file_path) as doc:
            text_content = "".join(page.get_text() for page in doc)
    else:
        with open(file_path, 'r', encoding='utf-8') as f:
            text_content = f.read()
    # Truncate to keep the prompt within a safe context budget.
    full_prompt = f"{prompt}\n\n--- Document Content ---\n{text_content[:10000]}..."
    yield from invoke_oracle(api_key, model, "You analyze documents. Respond in Markdown.", [{"type": "text", "text": full_prompt}], history)
def summon_chronicle_from_video(api_key, model, prompt, video_path, history, progress=gr.Progress()):
    if video_path is None:
        raise gr.Error("A video must be provided.")
    get_key(api_key)
    base_video_path, _ = os.path.splitext(video_path)
    progress(0.1, desc="🔮 Extracting Audio...")
    audio_path = f"{base_video_path}.mp3"
    transcript_text = "No audio found."
    try:
        with VideoFileClip(video_path) as clip:
            clip.audio.write_audiofile(audio_path, bitrate="32k", logger=None)
        progress(0.3, desc="🎤 Transcribing Audio...")
        with open(audio_path, "rb") as audio_file:
            transcript_text = o.audio.transcriptions.create(model="whisper-1", file=audio_file).text
    except Exception as e:
        print(f"Audio failed: {e}")  # a silent video (clip.audio is None) lands here too
    progress(0.6, desc="🖼️ Sampling Frames...")
    base64Frames = []
    video = cv2.VideoCapture(video_path)
    total_frames, fps = int(video.get(cv2.CAP_PROP_FRAME_COUNT)), video.get(cv2.CAP_PROP_FPS)
    frames_to_skip = max(1, int(fps * 2))  # one frame roughly every 2 seconds; guard against fps == 0
    for curr_frame in range(0, total_frames - 1, frames_to_skip):
        if len(base64Frames) >= 10:
            break
        video.set(cv2.CAP_PROP_POS_FRAMES, curr_frame)
        success, frame = video.read()
        if not success:
            break
        _, buffer = cv2.imencode(".jpg", frame)
        base64Frames.append(base64.b64encode(buffer).decode("utf-8"))
    video.release()
    progress(0.8, desc="🌀 Consulting Oracle...")
    user_content = [
        {"type": "text", "text": f"{prompt}\n\n--- Audio Transcript ---\n{transcript_text}"},
        *({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{frame}", "detail": "low"}} for frame in base64Frames),
    ]
    yield from invoke_oracle(api_key, model, "You are a video analyst. Respond in Markdown.", user_content, history)
def generate_speech(api_key, tts_model, voice, text, language, audio_format, progress=gr.Progress()):
    """A ritual to give voice to the written word, in any tongue."""
    get_key(api_key)
    # The Radio passes the flag-decorated label (e.g. "🇬🇧 English"); map it to a plain name.
    language = LANGUAGES.get(language, language)
    # Step 1: Translate the text if the target language is not English.
    translated_text = text
    if language != "English":
        progress(0.2, desc=f"Translating to {language}...")
        try:
            response = o.chat.completions.create(
                model="gpt-4o",
                messages=[
                    {"role": "system", "content": f"You are a translator. Translate the following text to {language}. Output only the translated text."},
                    {"role": "user", "content": text}
                ],
                temperature=0
            )
            translated_text = response.choices[0].message.content
        except Exception as e:
            raise gr.Error(f"Translation failed: {e}")
    # Step 2: Generate speech from the (possibly translated) text.
    progress(0.6, desc="Summoning voice...")
    speech_file_path = Path(__file__).parent / f"speech.{audio_format}"
    try:
        response = o.audio.speech.create(
            model=tts_model,
            voice=voice,
            input=translated_text,
            response_format=audio_format
        )
        # write_to_file replaces the deprecated stream_to_file helper in openai>=1.x.
        response.write_to_file(speech_file_path)
    except Exception as e:
        raise gr.Error(f"Speech generation failed: {e}")
    progress(1.0, desc="Voice summoned!")
    return str(speech_file_path), translated_text
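# Direct-call sketch (hypothetical values): generate_speech("sk-...", "tts-1",
# "alloy", "Hello world", "🇬🇧 English", "mp3") -> (".../speech.mp3", "Hello world")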
def summon_chat_reply(api_key, model, prompt, history):
    """Plain-text chat path. A lambda cannot yield, so streaming needs a named generator."""
    yield from invoke_oracle(api_key, model, "You are a helpful AI assistant.", [{"type": "text", "text": prompt}], history)

# 🔮 UI
with gr.Blocks(title=UI_TITLE, theme=gr.themes.Soft(primary_hue="red", secondary_hue="orange")) as demo:
    initial_state = load_state()
    app_state = gr.State(initial_state)
    gr.Markdown(H1.format(UI_TITLE))
    with gr.Accordion("🔑 Eldritch Key & Oracle Selection", open=True):
        with gr.Row():
            api_key_box = gr.Textbox(label="🔑 Key", type="password", placeholder="sk-...", scale=3, value=initial_state.get('api_key', ''))
            save_btn = gr.Button("💾", scale=1)
            status_txt = gr.Textbox(interactive=False, scale=1, label="Status")
        model_selector = gr.Dropdown(choices=list(MODELS.keys()), label="🔮 Oracle", value=initial_state.get('model', "GPT-4o ✨"))
        save_btn.click(save_key, inputs=api_key_box, outputs=status_txt)
    chatbot = gr.Chatbot(height=500, label="📜 Scroll of Conversation", type='messages', value=initial_state.get('chatbot', []))
    with gr.Tabs():
        with gr.TabItem("💬 Chat"):
            text_prompt = gr.Textbox(label="Your Quest:", placeholder="Type your message...", value=initial_state.get('text_prompt', ''))
            text_event = text_prompt.submit(fn=summon_chat_reply, inputs=[api_key_box, model_selector, text_prompt, chatbot], outputs=chatbot)
with gr.TabItem("🖼️ Image"): | |
with gr.Row(): | |
image_input = gr.File(label="Upload Image", type="file") | |
image_output = gr.Image(label="Your Image", type="filepath", interactive=False) | |
image_prompt = gr.Textbox(label="Image Prompt:", value=initial_state.get('image_prompt', "What is in this image?")) | |
image_btn = gr.Button("👁️ Summon Vision") | |
image_input.change(lambda x: x, inputs=image_input, outputs=image_output) | |
image_event = image_btn.click(summon_vision_from_image, [api_key_box, model_selector, image_prompt, image_input, chatbot], chatbot) | |
with gr.TabItem("🎤 Audio"): | |
audio_input = gr.File(label="Upload Audio", type="file") | |
audio_prompt = gr.Textbox(label="Audio Prompt:", value=initial_state.get('audio_prompt', "Summarize this audio.")) | |
audio_btn = gr.Button("🗣️ Summon Echo") | |
audio_event = audio_btn.click(summon_echo_from_audio, [api_key_box, model_selector, audio_prompt, audio_input, chatbot], chatbot) | |
with gr.TabItem("🎥 Video"): | |
video_input = gr.File(label="Upload Video", type="file") | |
video_prompt = gr.Textbox(label="Video Prompt:", value=initial_state.get('video_prompt', "Summarize this video.")) | |
video_btn = gr.Button("🎬 Summon Chronicle") | |
video_event = video_btn.click(summon_chronicle_from_video, [api_key_box, model_selector, video_prompt, video_input, chatbot], chatbot) | |
with gr.TabItem("📄 Document"): | |
doc_input = gr.File(label="Upload PDF or TXT", type="file") | |
doc_prompt = gr.Textbox(label="Document Prompt:", value=initial_state.get('doc_prompt', "Summarize this document.")) | |
doc_btn = gr.Button("📖 Summon Wisdom") | |
doc_event = doc_btn.click(summon_wisdom_from_text, [api_key_box, model_selector, doc_prompt, doc_input, chatbot], chatbot) | |
with gr.TabItem("🔊 Speech Synthesis"): | |
gr.Markdown(H2.format("Give Voice to Words")) | |
tts_language = gr.Radio(choices=list(LANGUAGES.keys()), label="🈯 Language", value=initial_state.get('tts_language', "🇬🇧 English")) | |
with gr.Row(): | |
tts_voice = gr.Dropdown(choices=VOICES, label="🗣️ Voice", value=initial_state.get('tts_voice', "alloy")) | |
tts_model_select = gr.Dropdown(choices=TTS_MODELS, label="🧠 TTS Model", value=initial_state.get('tts_model', "gpt-4o-mini-tts")) | |
tts_format = gr.Dropdown(choices=FORMATS, label="📦 Format", value=initial_state.get('tts_format', "mp3")) | |
tts_text_input = gr.Textbox(label="📜 Text to Speak", lines=4, placeholder="Enter text here...", value=initial_state.get('tts_text', '')) | |
tts_btn = gr.Button("🔊 Generate Speech") | |
tts_translated_text = gr.Textbox(label="Translated Text (Output)", interactive=False) | |
tts_audio_output = gr.Audio(label="🎧 Spoken Word", type="filepath") | |
tts_event = tts_btn.click(generate_speech, [api_key_box, tts_model_select, tts_voice, tts_text_input, tts_language, tts_format], [tts_audio_output, tts_translated_text]) | |
    # --- Autosave Event Listeners ---
    components_to_save = {
        'api_key': api_key_box, 'model': model_selector, 'text_prompt': text_prompt,
        'image_prompt': image_prompt, 'audio_prompt': audio_prompt, 'video_prompt': video_prompt,
        'doc_prompt': doc_prompt, 'tts_language': tts_language, 'tts_voice': tts_voice,
        'tts_model': tts_model_select, 'tts_format': tts_format, 'tts_text': tts_text_input
    }
    for key, component in components_to_save.items():
        # gr.State(key) feeds each component's save-key to the shared handler as a constant input.
        component.change(update_and_save, [gr.State(key), component, app_state], app_state)
    for event in [text_event, image_event, audio_event, video_event, doc_event]:
        event.then(lambda history, state: update_and_save('chatbot', history, state), [chatbot, app_state], app_state)
if __name__ == "__main__":
    demo.launch(share=True, debug=True)