import os
import json
from pathlib import Path

import gradio as gr
import openai as o
import cv2
import numpy as np
import soundfile as sf
from fastrtc import WebRTC  # the WebRTC component is exported from the package root
# 📜 CONFIG
UI_TITLE = "✨🧙‍♂️🔮 GPT-4o Omni-Oracle"
KEY_FILE = "key.txt"
STATE_FILE = "app_state.json"
MODELS = {
    "GPT-4o ✨": "gpt-4o",
    "o3 (Advanced Reasoning) 🧠": "gpt-4-turbo",      # Placeholder
    "o4-mini (Fastest) ⚡": "gpt-4-turbo",            # Placeholder
    "o4-mini-high (Vision) 👁️‍🗨️": "gpt-4o",           # Placeholder
    "GPT-4.5 (Research) 🔬": "gpt-4-turbo-preview",   # Placeholder
    "GPT-4.1 (Analysis) 💻": "gpt-4-turbo",           # Placeholder
    "GPT-4.1-mini (Everyday) ☕": "gpt-4-turbo",       # Placeholder
}
VOICES = ["alloy", "ash", "ballad", "coral", "echo", "fable", "nova", "onyx", "sage", "shimmer"]
TTS_MODELS = ["gpt-4o-mini-tts", "tts-1", "tts-1-hd"]
FORMATS = ["mp3", "opus", "aac", "flac", "wav", "pcm"]
LANGUAGES = {
    "🇬🇧 English": "English", "🇨🇳 Chinese": "Chinese", "🇫🇷 French": "French", "🇩🇪 German": "German",
    "🇮🇱 Hebrew": "Hebrew", "🇮🇳 Hindi": "Hindi", "🇯🇵 Japanese": "Japanese", "🇳🇿 Maori": "Maori",
    "🇷🇺 Russian": "Russian", "🇪🇸 Spanish": "Spanish",
}
# For WebRTC - replace with your own STUN/TURN servers if deploying on a cloud provider
RTC_CONFIGURATION = {
    "iceServers": [{"urls": ["stun:stun.l.google.com:19302"]}]
}
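# Optional: a relay entry for deployments behind strict NATs. The host, username, and
# credential below are placeholders, not a real TURN server — substitute your own:
# RTC_CONFIGURATION = {
#     "iceServers": [
#         {"urls": ["stun:stun.l.google.com:19302"]},
#         {"urls": ["turn:turn.example.com:3478"], "username": "user", "credential": "pass"},
#     ]
# }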
# 🎨 STYLE
H1 = "# <font size='7'>{0}</font>"
H2 = "## <font size='6'>{0}</font>"
CSS = """
.my-group {max-width: 500px !important; max-height: 500px !important;}
.my-column {display: flex !important; justify-content: center !important; align-items: center !important;}
"""
# 🪄 HELPERS, LORE & AUTOSAVE RITUALS
def save_state(data: dict):
    """Persist the current UI state to disk so it survives a page reload."""
    with open(STATE_FILE, 'w') as f:
        json.dump(data, f, indent=4)

def load_state() -> dict:
    """Load the last saved UI state, returning an empty dict if the file is missing or corrupt."""
    if os.path.exists(STATE_FILE):
        with open(STATE_FILE, 'r') as f:
            try:
                return json.load(f)
            except json.JSONDecodeError:
                return {}
    return {}

def update_and_save(key: str, value, state: dict):
    """Update a single key in the state dict and immediately write it to disk."""
    state[key] = value
    save_state(state)
    return state
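# For reference, app_state.json roughly takes this shape once a few fields have been edited
# (keys come from the autosave listeners below; the values here are purely illustrative):
# {
#     "api_key": "sk-...",
#     "model": "GPT-4o ✨",
#     "text_prompt": "Summon me a sonnet",
#     "tts_language": "🇬🇧 English",
#     "tts_voice": "alloy",
#     "tts_model": "gpt-4o-mini-tts",
#     "tts_format": "mp3",
#     "tts_text": "Hello, traveler",
#     "chatbot": [{"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]
# }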
def save_key(k: str) -> str:
    """Write the OpenAI API key to a local file for reuse across sessions."""
    if not k or not k.strip():
        return "🚫 Empty Key"
    with open(KEY_FILE, "w") as f:
        f.write(k.strip())
    return "🔑✅ Key Saved!"

def get_key(k: str) -> str:
    """Resolve the API key from the textbox, the key file, or the environment, and set it on the client."""
    if not (k and k.strip()):
        k = open(KEY_FILE).read() if os.path.exists(KEY_FILE) else os.getenv("OPENAI_KEY", "")
    k = k.strip()
    if not k:
        raise gr.Error("❗🔑 An Eldritch Key (OpenAI API Key) is required.")
    o.api_key = k
    return k
def invoke_oracle(scribe_key: str, model_key: str, system_prompt: str, user_content: list, history: list):
    """Stream a chat completion from the chosen Oracle, yielding the growing history for the Chatbot."""
    get_key(scribe_key)
    model_name = MODELS.get(model_key, "gpt-4o")
    messages = [{"role": "system", "content": system_prompt}] + history + [{"role": "user", "content": user_content}]
    try:
        prophecy = o.chat.completions.create(model=model_name, messages=messages, stream=True)
        # Show the user's actual text in the scroll rather than a placeholder.
        user_text = next((part.get("text", "") for part in user_content if part.get("type") == "text"), "...")
        history.append({"role": "user", "content": user_text})
        history.append({"role": "assistant", "content": ""})
        for chunk in prophecy:
            if chunk.choices and chunk.choices[0].delta.content:
                history[-1]['content'] += chunk.choices[0].delta.content
                yield history
    except Exception as e:
        yield history + [{"role": "assistant", "content": f"🧙‍♂️🔮 A magical disturbance occurred: {str(e)}"}]
def handle_text_submission(api_key, model, prompt, history):
    """A clear path for text quests to the Oracle."""
    yield from invoke_oracle(api_key, model, "You are a helpful AI assistant.", [{"type": "text", "text": prompt}], history)
# --- Image & Audio Streaming Functions ---
def transform_cv2(frame: np.ndarray, transform: str):
    """Applies a magical filter to a single frame from a webcam stream."""
    if frame is None:
        return None
    if transform == "cartoon":
        # Smooth colours on a downsampled copy, then mask it with the edge map.
        img_color = cv2.pyrDown(cv2.pyrDown(frame))
        for _ in range(6):
            img_color = cv2.bilateralFilter(img_color, 9, 9, 7)
        img_color = cv2.pyrUp(cv2.pyrUp(img_color))
        # pyrDown/pyrUp can drift from the original size when it is not divisible by 4, so resize back.
        img_color = cv2.resize(img_color, (frame.shape[1], frame.shape[0]))
        img_edges = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
        img_edges = cv2.adaptiveThreshold(cv2.medianBlur(img_edges, 7), 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 9, 2)
        img_edges = cv2.cvtColor(img_edges, cv2.COLOR_GRAY2RGB)
        return cv2.bitwise_and(img_color, img_edges)
    elif transform == "edges":
        return cv2.cvtColor(cv2.Canny(frame, 100, 200), cv2.COLOR_GRAY2RGB)
    elif transform == "flip":
        return np.flipud(frame)
    return frame
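# Quick offline sanity check for the filters (assumes a typical 640x480 RGB frame; not part of the app flow):
#   dummy = np.zeros((480, 640, 3), dtype=np.uint8)
#   assert transform_cv2(dummy, "cartoon").shape == dummy.shape
#   assert transform_cv2(dummy, "edges").shape == dummy.shape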
def transcribe_streaming(api_key, audio_chunk, history_state):
    """Transcribes each incoming audio chunk with Whisper and appends it to the running transcript."""
    if audio_chunk is None:
        return history_state, history_state
    get_key(api_key)
    sample_rate, data = audio_chunk
    temp_wav_path = f"temp_chunk_{hash(data.tobytes())}.wav"
    sf.write(temp_wav_path, data, sample_rate)
    try:
        with open(temp_wav_path, "rb") as audio_file:
            transcript = o.audio.transcriptions.create(model="whisper-1", file=audio_file)
        new_text = transcript.text
    except Exception as e:
        print(f"Transcription error: {e}")
        new_text = ""
    finally:
        if os.path.exists(temp_wav_path):
            os.remove(temp_wav_path)
    history_state += new_text + " "
    return history_state, history_state
def generate_speech(api_key, tts_model, voice, text, language_key, format, progress=gr.Progress()):
    """Optionally translate the text, then synthesize it into an audio file with the chosen voice."""
    get_key(api_key)
    language = LANGUAGES.get(language_key, "English")
    progress(0.2, desc=f"Translating to {language}...")
    translated_text = text
    if language != "English":
        try:
            response = o.chat.completions.create(
                model="gpt-4o",
                messages=[
                    {"role": "system", "content": f"Translate to {language}. Output only the translation."},
                    {"role": "user", "content": text},
                ],
                temperature=0,
            )
            translated_text = response.choices[0].message.content
        except Exception as e:
            raise gr.Error(f"Translation failed: {e}")
    progress(0.6, desc="Summoning voice...")
    speech_file_path = Path(__file__).parent / f"speech.{format}"
    try:
        response = o.audio.speech.create(model=tts_model, voice=voice, input=translated_text, response_format=format)
        response.stream_to_file(speech_file_path)
    except Exception as e:
        raise gr.Error(f"Speech generation failed: {e}")
    progress(1.0, desc="Voice summoned!")
    return str(speech_file_path), translated_text
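# Note: recent openai-python releases flag `stream_to_file` on the plain response as deprecated.
# If your client version warns about it, the streaming variant (assuming openai>=1.x) is roughly:
#   with o.audio.speech.with_streaming_response.create(
#       model=tts_model, voice=voice, input=translated_text, response_format=format
#   ) as streamed:
#       streamed.stream_to_file(speech_file_path)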
# 🔮 UI
with gr.Blocks(title=UI_TITLE, theme=gr.themes.Soft(primary_hue="red", secondary_hue="orange"), css=CSS) as demo:
    initial_state = load_state()
    app_state = gr.State(initial_state)
    gr.Markdown(H1.format(UI_TITLE))

    with gr.Accordion("🔑 Eldritch Key & Oracle Selection", open=True):
        with gr.Row():
            api_key_box = gr.Textbox(label="🔑 Key", type="password", placeholder="sk-...", scale=3, value=initial_state.get('api_key', ''))
            save_btn = gr.Button("💾", scale=1)
            status_txt = gr.Textbox(interactive=False, scale=1, label="Status")
        model_selector = gr.Dropdown(choices=list(MODELS.keys()), label="🔮 Oracle", value=initial_state.get('model', "GPT-4o ✨"))
        save_btn.click(save_key, inputs=api_key_box, outputs=status_txt)

    chatbot = gr.Chatbot(height=400, label="📜 Scroll of Conversation", type='messages', value=initial_state.get('chatbot', []))

    with gr.Tabs():
        with gr.TabItem("💬 Chat"):
            text_prompt = gr.Textbox(label="Your Quest:", placeholder="Type your message...", value=initial_state.get('text_prompt', ''))
            text_event = text_prompt.submit(fn=handle_text_submission, inputs=[api_key_box, model_selector, text_prompt, chatbot], outputs=chatbot)

        with gr.TabItem("🖼️ Streaming Image"):
            gr.Markdown(H2.format("Live Image Enchantments"))
            with gr.Column(elem_classes=["my-column"]):
                with gr.Group(elem_classes=["my-group"]):
                    transform_filter = gr.Dropdown(choices=["cartoon", "edges", "flip"], value="flip", label="Transformation")
                    streaming_image = gr.Image(sources=["webcam"], type="numpy", streaming=True)
            streaming_image.stream(transform_cv2, [streaming_image, transform_filter], streaming_image, time_limit=30, stream_every=0.1)

        with gr.TabItem("🎤 Streaming Audio"):
            gr.Markdown(H2.format("Real-time Transcription Rite"))
            with gr.Row():
                mic_input = gr.Audio(sources=["microphone"], streaming=True)
                transcript_output = gr.Textbox(label="Transcript", interactive=False)
            transcript_state = gr.State(value="")
            mic_input.stream(transcribe_streaming, [api_key_box, mic_input, transcript_state], [transcript_state, transcript_output], time_limit=30, stream_every=2)
        with gr.TabItem("👁️ Object Detection (WebRTC)"):
            gr.Markdown(H2.format("Live Scrying Spell"))
            gr.HTML("<h3 style='text-align: center'>NOTE: This is a UI placeholder. A separate inference server for the YOLO model is required for this to function.</h3>")
            with gr.Column(elem_classes=["my-column"]):
                with gr.Group(elem_classes=["my-group"]):
                    webrtc_stream = WebRTC(label="Stream", rtc_configuration=RTC_CONFIGURATION)
                    conf_threshold = gr.Slider(label="Confidence Threshold", minimum=0.0, maximum=1.0, step=0.05, value=0.30)
            # Placeholder for the actual stream event handler, which would call a loaded YOLOv10 model:
            # def detection_placeholder(image, conf): return image
            # webrtc_stream.stream(fn=detection_placeholder, inputs=[webrtc_stream, conf_threshold], outputs=[webrtc_stream], time_limit=10)
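            # A minimal local sketch of such a handler, assuming the `ultralytics` package and a
            # YOLOv10 checkpoint (e.g. "yolov10n.pt") rather than the remote server mentioned above:
            #   from ultralytics import YOLO
            #   yolo = YOLO("yolov10n.pt")
            #   def detect(image, conf):
            #       return yolo.predict(image, conf=conf)[0].plot()
            #   webrtc_stream.stream(fn=detect, inputs=[webrtc_stream, conf_threshold],
            #                        outputs=[webrtc_stream], time_limit=10)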
        with gr.TabItem("🔊 Speech Synthesis"):
            gr.Markdown(H2.format("Give Voice to Words"))
            tts_language = gr.Radio(choices=list(LANGUAGES.keys()), label="🈯 Language", value=initial_state.get('tts_language', "🇬🇧 English"))
            with gr.Row():
                tts_voice = gr.Dropdown(choices=VOICES, label="🗣️ Voice", value=initial_state.get('tts_voice', "alloy"))
                tts_model_select = gr.Dropdown(choices=TTS_MODELS, label="🧠 TTS Model", value=initial_state.get('tts_model', "gpt-4o-mini-tts"))
                tts_format = gr.Dropdown(choices=FORMATS, label="📦 Format", value=initial_state.get('tts_format', "mp3"))
            tts_text_input = gr.Textbox(label="📜 Text to Speak", lines=4, placeholder="Enter text here...", value=initial_state.get('tts_text', ''))
            tts_btn = gr.Button("🔊 Generate Speech")
            tts_translated_text = gr.Textbox(label="Translated Text (Output)", interactive=False)
            tts_audio_output = gr.Audio(label="🎧 Spoken Word", type="filepath")
            tts_event = tts_btn.click(generate_speech, [api_key_box, tts_model_select, tts_voice, tts_text_input, tts_language, tts_format], [tts_audio_output, tts_translated_text])

    # --- Autosave Event Listeners ---
    components_to_save = {
        'api_key': api_key_box, 'model': model_selector, 'text_prompt': text_prompt,
        'tts_language': tts_language, 'tts_voice': tts_voice,
        'tts_model': tts_model_select, 'tts_format': tts_format, 'tts_text': tts_text_input,
    }
    for key, component in components_to_save.items():
        component.change(update_and_save, [gr.State(key), component, app_state], app_state)
    text_event.then(lambda history, state: update_and_save('chatbot', history, state), [chatbot, app_state], app_state)

if __name__ == "__main__":
    demo.launch(share=True, debug=True)
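# Likely requirements.txt for this Space, inferred from the imports above (pin versions as needed):
#   gradio
#   openai
#   opencv-python
#   numpy
#   soundfile
#   fastrtc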