import os
import gradio as gr
import openai as o
import base64
import fitz  # PyMuPDF
import cv2
from moviepy.video.io.VideoFileClip import VideoFileClip
import json
from pathlib import Path

# 📜 CONFIG
UI_TITLE = "✨🧙‍♂️🔮 GPT-4o Omni-Oracle"
KEY_FILE = "key.txt"
STATE_FILE = "app_state.json"
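# UI labels map to API model ids; entries marked "Placeholder" are pinned to
# shipping models until their namesakes are exposed by the API.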
MODELS = {
    "GPT-4o ✨": "gpt-4o",
    "o3 (Advanced Reasoning) �": "gpt-4-turbo", # Placeholder
    "o4-mini (Fastest) ⚡": "gpt-4-turbo", # Placeholder
    "o4-mini-high (Vision) 👁️‍🗨️": "gpt-4o", # Placeholder
    "GPT-4.5 (Research) 🔬": "gpt-4-turbo-preview", # Placeholder
    "GPT-4.1 (Analysis) 💻": "gpt-4-turbo", # Placeholder
    "GPT-4.1-mini (Everyday) ☕": "gpt-4-turbo", # Placeholder
    "GPT-4 Turbo 🚀": "gpt-4-turbo",
    "GPT-3.5 Turbo ⚡": "gpt-3.5-turbo",
}
VOICES = ["alloy", "ash", "ballad", "coral", "echo", "fable", "nova", "onyx", "sage", "shimmer"]
TTS_MODELS = ["gpt-4o-mini-tts", "tts-1", "tts-1-hd"]
FORMATS = ["mp3", "opus", "aac", "flac", "wav", "pcm"]
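# Voice, TTS-model, and response-format choices as documented for the OpenAI
# audio API at the time of writing.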
LANGUAGES = {
    "🇬🇧 English": "English", "🇨🇳 Chinese": "Chinese", "🇫🇷 French": "French", "🇩🇪 German": "German", 
    "🇮🇱 Hebrew": "Hebrew", "🇮🇳 Hindi": "Hindi", "🇯🇵 Japanese": "Japanese", "🇳🇿 Maori": "Maori", 
    "🇷🇺 Russian": "Russian", "🇪🇸 Spanish": "Spanish"
}
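# The Radio shows the flag-prefixed keys; only the plain values on the right
# are ever spliced into the translation prompt.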


# 🎨 STYLE
H1 = "# <font size='7'>{0}</font>"
H2 = "## <font size='6'>{0}</font>"

# 🪄 HELPERS, LORE & AUTOSAVE RITUALS
def save_state(data: dict):
    """A rune that inscribes the session's memory onto a JSON scroll."""
    with open(STATE_FILE, 'w') as f:
        json.dump(data, f, indent=4)

def load_state() -> dict:
    """A ritual to recall the session's memory from the JSON scroll."""
    if os.path.exists(STATE_FILE):
        with open(STATE_FILE, 'r') as f:
            try:
                return json.load(f)
            except json.JSONDecodeError:
                return {}
    return {}

def update_and_save(key: str, value, state: dict):
    """A binding spell that updates a memory and immediately inscribes it."""
    state[key] = value
    save_state(state)
    return state

def save_key(k: str) -> str:
    "💾🔑 A rune to bind the Eldritch Key."
    if not k or not k.strip(): return "🚫 Empty Key"
    with open(KEY_FILE, "w") as f: f.write(k.strip())
    return "🔑✅ Key Saved!"

def get_key(k: str) -> str:
    "📜🔑 A ritual to summon the Eldritch Key."
    # Resolution order: textbox value, then the key file, then the OPENAI_KEY env var.
    k = k.strip() if k else ""
    if not k and os.path.exists(KEY_FILE):
        with open(KEY_FILE) as f:
            k = f.read().strip()
    if not k:
        k = os.getenv("OPENAI_KEY", "")
    if not k: raise gr.Error("❗🔑 An Eldritch Key (OpenAI API Key) is required.")
    o.api_key = k
    return k

def file_to_base64(file_path):
    with open(file_path, "rb") as f:
        return base64.b64encode(f.read()).decode('utf-8')

def invoke_oracle(scribe_key: str, model_name: str, system_prompt: str, user_content: list, history: list):
    get_key(scribe_key)
    # Keep only role/content (Gradio's messages may carry extra keys such as
    # 'metadata') and lead with the system prompt, as the chat API expects.
    past = [{"role": m["role"], "content": m["content"]} for m in history]
    messages = [{"role": "system", "content": system_prompt}] + past + [{"role": "user", "content": user_content}]
    try:
        prophecy = o.chat.completions.create(model=model_name, messages=messages, stream=True)
        # Record the text portion of the request so the scroll reads sensibly.
        user_text = next((p.get("text", "...") for p in user_content if p.get("type") == "text"), "...")
        history.append({"role": "user", "content": user_text})
        history.append({"role": "assistant", "content": ""})
        for chunk in prophecy:
            if chunk.choices[0].delta.content:
                history[-1]['content'] += chunk.choices[0].delta.content
                yield history
    except Exception as e:
        yield history + [{"role": "assistant", "content": f"🧙‍♂️🔮 A magical disturbance occurred: {str(e)}"}]

# --- Modality-Specific Summoning Rituals ---
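# Each ritual receives a filepath string from gr.File, builds the content
# payload its modality requires, and delegates the streaming to invoke_oracle.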

def summon_vision_from_image(api_key, model, prompt, image_path, history):
    if image_path is None: raise gr.Error("An image must be provided.")
    b64_image = file_to_base64(image_path)
    user_content = [{"type": "text", "text": prompt}, {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{b64_image}"}}]
    yield from invoke_oracle(api_key, model, "You are an assistant that analyzes images. Respond in Markdown.", user_content, history)

def summon_echo_from_audio(api_key, model, prompt, audio_path, history):
    if audio_path is None: raise gr.Error("An audio file must be provided.")
    get_key(api_key)
    with open(audio_path, "rb") as audio_file:
        transcription = o.audio.transcriptions.create(model="whisper-1", file=audio_file)
    full_prompt = f"{prompt}\n\n--- Transcription ---\n{transcription.text}"
    yield from invoke_oracle(api_key, model, "You analyze audio transcripts. Respond in Markdown.", [{"type": "text", "text": full_prompt}], history)

def summon_wisdom_from_text(api_key, model, prompt, file_path, history):
    if file_path is None: raise gr.Error("A file must be provided.")
    text_content = ""
    if file_path.lower().endswith('.pdf'):
        with fitz.open(file_path) as doc:
            text_content = "".join(page.get_text() for page in doc)
    else:
        with open(file_path, 'r', encoding='utf-8') as f:
            text_content = f.read()
    full_prompt = f"{prompt}\n\n--- Document Content ---\n{text_content[:10000]}..."
    yield from invoke_oracle(api_key, model, "You analyze documents. Respond in Markdown.", [{"type": "text", "text": full_prompt}], history)

def summon_chronicle_from_video(api_key, model, prompt, video_path, history, progress=gr.Progress()):
    if video_path is None: raise gr.Error("A video must be provided.")
    get_key(api_key)
    base_video_path, _ = os.path.splitext(video_path)
    progress(0.1, desc="🔮 Extracting Audio...")
    audio_path = f"{base_video_path}.mp3"
    transcript_text = "No audio found."
    try:
        with VideoFileClip(video_path) as clip:
            clip.audio.write_audiofile(audio_path, bitrate="32k", logger=None)
        progress(0.3, desc="🎤 Transcribing Audio...")
        with open(audio_path, "rb") as audio_file:
            transcript_text = o.audio.transcriptions.create(model="whisper-1", file=audio_file).text
    except Exception as e:
        print(f"Audio failed: {e}")
    progress(0.6, desc="🖼️ Sampling Frames...")
    base64Frames = []
    video = cv2.VideoCapture(video_path)
    total_frames, fps = int(video.get(cv2.CAP_PROP_FRAME_COUNT)), video.get(cv2.CAP_PROP_FPS)
    frames_to_skip = max(int(fps * 2), 1)  # ~1 frame every 2 seconds; guard against fps=0
    for curr_frame in range(0, total_frames - 1, frames_to_skip):
        if len(base64Frames) >= 10: break
        video.set(cv2.CAP_PROP_POS_FRAMES, curr_frame)
        success, frame = video.read()
        if not success: break
        _, buffer = cv2.imencode(".jpg", frame)
        base64Frames.append(base64.b64encode(buffer).decode("utf-8"))
    video.release()
    progress(0.8, desc="🌀 Consulting Oracle...")
    user_content = [{"type": "text", "text": f"{prompt}\n\n--- Audio Transcript ---\n{transcript_text}"}, *map(lambda x: {"type": "image_url", "image_url": {"url": f'data:image/jpg;base64,{x}', "detail": "low"}}, base64Frames)]
    yield from invoke_oracle(api_key, model, "You are a video analyst. Respond in Markdown.", user_content, history)

def generate_speech(api_key, tts_model, voice, text, language, audio_format, progress=gr.Progress()):
    """A ritual to give voice to the written word, in any tongue."""
    get_key(api_key)
    # The Radio hands over its flag-prefixed label (e.g. "🇬🇧 English");
    # map it back to the plain language name before building any prompt.
    language = LANGUAGES.get(language, language)

    # Step 1: Translate the text if the target language is not English
    translated_text = text
    if language != "English":
        progress(0.2, desc=f"Translating to {language}...")
        try:
            response = o.chat.completions.create(
                model="gpt-4o",
                messages=[
                    {"role": "system", "content": f"You are a translator. Translate the following text to {language}. Output only the translated text."},
                    {"role": "user", "content": text}
                ],
                temperature=0
            )
            translated_text = response.choices[0].message.content
        except Exception as e:
            raise gr.Error(f"Translation failed: {e}")

    # Step 2: Generate speech from the (possibly translated) text
    progress(0.6, desc="Summoning voice...")
    speech_file_path = Path(__file__).parent / f"speech.{audio_format}"
    try:
        response = o.audio.speech.create(
            model=tts_model,
            voice=voice,
            input=translated_text,
            response_format=audio_format
        )
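        # Note: newer openai releases prefer o.audio.speech.with_streaming_response
        # for writing audio to disk; stream_to_file still works here.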
        response.stream_to_file(speech_file_path)
    except Exception as e:
        raise gr.Error(f"Speech generation failed: {e}")
        
    progress(1.0, desc="Voice summoned!")
    return str(speech_file_path), translated_text

# 🔮 UI
with gr.Blocks(title=UI_TITLE, theme=gr.themes.Soft(primary_hue="red", secondary_hue="orange")) as demo:
    initial_state = load_state()
    app_state = gr.State(initial_state)
    gr.Markdown(H1.format(UI_TITLE))

    with gr.Accordion("🔑 Eldritch Key & Oracle Selection", open=True):
        with gr.Row():
            api_key_box = gr.Textbox(label="🔑 Key", type="password", placeholder="sk-...", scale=3, value=initial_state.get('api_key', ''))
            save_btn = gr.Button("💾", scale=1)
            status_txt = gr.Textbox(interactive=False, scale=1, label="Status")
        model_selector = gr.Dropdown(choices=list(MODELS.keys()), label="🔮 Oracle", value=initial_state.get('model', "GPT-4o ✨"))
        save_btn.click(save_key, inputs=api_key_box, outputs=status_txt)

    chatbot = gr.Chatbot(height=500, label="📜 Scroll of Conversation", type='messages', value=initial_state.get('chatbot', []))

    with gr.Tabs():
        with gr.TabItem("💬 Chat"):
            text_prompt = gr.Textbox(label="Your Quest:", placeholder="Type your message...", value=initial_state.get('text_prompt', ''))
            def summon_chat(api_key, model, prompt, hist):
                # Must be a true generator function; Gradio only streams when the
                # handler itself is a generator, not when it merely returns one.
                yield from invoke_oracle(api_key, model, "You are a helpful AI assistant.", [{"type": "text", "text": prompt}], hist)
            text_event = text_prompt.submit(fn=summon_chat, inputs=[api_key_box, model_selector, text_prompt, chatbot], outputs=chatbot)

        with gr.TabItem("🖼️ Image"):
            with gr.Row():
                image_input = gr.File(label="Upload Image", type="filepath")
                image_output = gr.Image(label="Your Image", type="filepath", interactive=False)
            image_prompt = gr.Textbox(label="Image Prompt:", value=initial_state.get('image_prompt', "What is in this image?"))
            image_btn = gr.Button("👁️ Summon Vision")
            image_input.change(lambda x: x, inputs=image_input, outputs=image_output)
            image_event = image_btn.click(summon_vision_from_image, [api_key_box, model_selector, image_prompt, image_input, chatbot], chatbot)

        with gr.TabItem("🎤 Audio"):
            audio_input = gr.File(label="Upload Audio", type="filepath")
            audio_prompt = gr.Textbox(label="Audio Prompt:", value=initial_state.get('audio_prompt', "Summarize this audio."))
            audio_btn = gr.Button("🗣️ Summon Echo")
            audio_event = audio_btn.click(summon_echo_from_audio, [api_key_box, model_selector, audio_prompt, audio_input, chatbot], chatbot)

        with gr.TabItem("🎥 Video"):
            video_input = gr.File(label="Upload Video", type="filepath")
            video_prompt = gr.Textbox(label="Video Prompt:", value=initial_state.get('video_prompt', "Summarize this video."))
            video_btn = gr.Button("🎬 Summon Chronicle")
            video_event = video_btn.click(summon_chronicle_from_video, [api_key_box, model_selector, video_prompt, video_input, chatbot], chatbot)
            
        with gr.TabItem("📄 Document"):
            doc_input = gr.File(label="Upload PDF or TXT", type="filepath")
            doc_prompt = gr.Textbox(label="Document Prompt:", value=initial_state.get('doc_prompt', "Summarize this document."))
            doc_btn = gr.Button("📖 Summon Wisdom")
            doc_event = doc_btn.click(summon_wisdom_from_text, [api_key_box, model_selector, doc_prompt, doc_input, chatbot], chatbot)
            
        with gr.TabItem("🔊 Speech Synthesis"):
            gr.Markdown(H2.format("Give Voice to Words"))
            tts_language = gr.Radio(choices=list(LANGUAGES.keys()), label="🈯 Language", value=initial_state.get('tts_language', "🇬🇧 English"))
            with gr.Row():
                tts_voice = gr.Dropdown(choices=VOICES, label="🗣️ Voice", value=initial_state.get('tts_voice', "alloy"))
                tts_model_select = gr.Dropdown(choices=TTS_MODELS, label="🧠 TTS Model", value=initial_state.get('tts_model', "gpt-4o-mini-tts"))
                tts_format = gr.Dropdown(choices=FORMATS, label="📦 Format", value=initial_state.get('tts_format', "mp3"))
            tts_text_input = gr.Textbox(label="📜 Text to Speak", lines=4, placeholder="Enter text here...", value=initial_state.get('tts_text', ''))
            tts_btn = gr.Button("🔊 Generate Speech")
            tts_translated_text = gr.Textbox(label="Translated Text (Output)", interactive=False)
            tts_audio_output = gr.Audio(label="🎧 Spoken Word", type="filepath")
            tts_event = tts_btn.click(generate_speech, [api_key_box, tts_model_select, tts_voice, tts_text_input, tts_language, tts_format], [tts_audio_output, tts_translated_text])

    # --- Autosave Event Listeners ---
    components_to_save = {
        'api_key': api_key_box, 'model': model_selector, 'text_prompt': text_prompt,
        'image_prompt': image_prompt, 'audio_prompt': audio_prompt, 'video_prompt': video_prompt,
        'doc_prompt': doc_prompt, 'tts_language': tts_language, 'tts_voice': tts_voice,
        'tts_model': tts_model_select, 'tts_format': tts_format, 'tts_text': tts_text_input
    }
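    # Wrapping each dict key in gr.State passes it as an input *value*, which
    # sidesteps Python's late binding of loop variables in closures.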
    for key, component in components_to_save.items():
        component.change(update_and_save, [gr.State(key), component, app_state], app_state)

    # After any oracle finishes streaming, also inscribe the final chat scroll.
    for event in [text_event, image_event, audio_event, video_event, doc_event]:
        event.then(lambda history, state: update_and_save('chatbot', history, state), [chatbot, app_state], app_state)

if __name__ == "__main__":
    demo.launch(share=True, debug=True)