import os
import gradio as gr
import openai as o
import base64
import fitz  # PyMuPDF
import cv2
from moviepy.video.io.VideoFileClip import VideoFileClip
import json
from pathlib import Path

# 📜 CONFIG
UI_TITLE = "✨🧙‍♂️🔮 GPT-4o Omni-Oracle"
KEY_FILE = "key.txt"
STATE_FILE = "app_state.json"
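# UI labels map to API model ids; entries marked "Placeholder" are pinned to
# shipping models until their namesakes are exposed by the API.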
MODELS = {
    "GPT-4o ✨": "gpt-4o",
    "o3 (Advanced Reasoning) �": "gpt-4-turbo", # Placeholder
    "o4-mini (Fastest) ⚡": "gpt-4-turbo", # Placeholder
    "o4-mini-high (Vision) 👁️‍🗨️": "gpt-4o", # Placeholder
    "GPT-4.5 (Research) 🔬": "gpt-4-turbo-preview", # Placeholder
    "GPT-4.1 (Analysis) 💻": "gpt-4-turbo", # Placeholder
    "GPT-4.1-mini (Everyday) ☕": "gpt-4-turbo", # Placeholder
    "GPT-4 Turbo 🚀": "gpt-4-turbo",
    "GPT-3.5 Turbo ⚡": "gpt-3.5-turbo",
}
VOICES = ["alloy", "ash", "ballad", "coral", "echo", "fable", "nova", "onyx", "sage", "shimmer"]
TTS_MODELS = ["gpt-4o-mini-tts", "tts-1", "tts-1-hd"]
FORMATS = ["mp3", "opus", "aac", "flac", "wav", "pcm"]
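# Voice, TTS-model, and response-format choices as documented for the OpenAI
# audio API at the time of writing.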
LANGUAGES = {
    "🇬🇧 English": "English", "🇨🇳 Chinese": "Chinese", "🇫🇷 French": "French", "🇩🇪 German": "German", 
    "🇮🇱 Hebrew": "Hebrew", "🇮🇳 Hindi": "Hindi", "🇯🇵 Japanese": "Japanese", "🇳🇿 Maori": "Maori", 
    "🇷🇺 Russian": "Russian", "🇪🇸 Spanish": "Spanish"
}
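# The Radio shows the flag-prefixed keys; only the plain values on the right
# are ever spliced into the translation prompt.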


# 🎨 STYLE
H1 = "# <font size='7'>{0}</font>"
H2 = "## <font size='6'>{0}</font>"

# 🪄 HELPERS, LORE & AUTOSAVE RITUALS
def save_state(data: dict):
    """A rune that inscribes the session's memory onto a JSON scroll."""
    with open(STATE_FILE, 'w') as f:
        json.dump(data, f, indent=4)

def load_state() -> dict:
    """A ritual to recall the session's memory from the JSON scroll."""
    if os.path.exists(STATE_FILE):
        with open(STATE_FILE, 'r') as f:
            try:
                return json.load(f)
            except json.JSONDecodeError:
                return {}
    return {}

def update_and_save(key: str, value, state: dict):
    """A binding spell that updates a memory and immediately inscribes it."""
    state[key] = value
    save_state(state)
    return state

def save_key(k: str) -> str:
    "💾🔑 A rune to bind the Eldritch Key."
    if not k or not k.strip(): return "🚫 Empty Key"
    with open(KEY_FILE, "w") as f: f.write(k.strip())
    return "🔑✅ Key Saved!"

def get_key(k: str) -> str:
    "📜🔑 A ritual to summon the Eldritch Key."
    # Resolution order: textbox value, then the key file, then the OPENAI_KEY env var.
    k = k.strip() if k else ""
    if not k and os.path.exists(KEY_FILE):
        with open(KEY_FILE) as f:
            k = f.read().strip()
    if not k:
        k = os.getenv("OPENAI_KEY", "")
    if not k: raise gr.Error("❗🔑 An Eldritch Key (OpenAI API Key) is required.")
    o.api_key = k
    return k

def file_to_base64(file_path):
    with open(file_path, "rb") as f:
        return base64.b64encode(f.read()).decode('utf-8')

def invoke_oracle(scribe_key: str, model_name: str, system_prompt: str, user_content: list, history: list):
    get_key(scribe_key)
    # Keep only role/content (Gradio's messages may carry extra keys such as
    # 'metadata') and lead with the system prompt, as the chat API expects.
    past = [{"role": m["role"], "content": m["content"]} for m in history]
    messages = [{"role": "system", "content": system_prompt}] + past + [{"role": "user", "content": user_content}]
    try:
        prophecy = o.chat.completions.create(model=model_name, messages=messages, stream=True)
        # Record the text portion of the request so the scroll reads sensibly.
        user_text = next((p.get("text", "...") for p in user_content if p.get("type") == "text"), "...")
        history.append({"role": "user", "content": user_text})
        history.append({"role": "assistant", "content": ""})
        for chunk in prophecy:
            if chunk.choices[0].delta.content:
                history[-1]['content'] += chunk.choices[0].delta.content
                yield history
    except Exception as e:
        yield history + [{"role": "assistant", "content": f"🧙‍♂️🔮 A magical disturbance occurred: {str(e)}"}]

# --- Modality-Specific Summoning Rituals ---
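# Each ritual receives a filepath string from gr.File, builds the content
# payload its modality requires, and delegates the streaming to invoke_oracle.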

def summon_vision_from_image(api_key, model, prompt, image_path, history):
    if image_path is None: raise gr.Error("An image must be provided.")
    b64_image = file_to_base64(image_path)
    user_content = [{"type": "text", "text": prompt}, {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{b64_image}"}}]
    yield from invoke_oracle(api_key, model, "You are an assistant that analyzes images. Respond in Markdown.", user_content, history)

def summon_echo_from_audio(api_key, model, prompt, audio_path, history):
    if audio_path is None: raise gr.Error("An audio file must be provided.")
    get_key(api_key)
    with open(audio_path, "rb") as audio_file:
        transcription = o.audio.transcriptions.create(model="whisper-1", file=audio_file)
    full_prompt = f"{prompt}\n\n--- Transcription ---\n{transcription.text}"
    yield from invoke_oracle(api_key, model, "You analyze audio transcripts. Respond in Markdown.", [{"type": "text", "text": full_prompt}], history)

def summon_wisdom_from_text(api_key, model, prompt, file_path, history):
    if file_path is None: raise gr.Error("A file must be provided.")
    text_content = ""
    if file_path.lower().endswith('.pdf'):
        with fitz.open(file_path) as doc:
            text_content = "".join(page.get_text() for page in doc)
    else:
        with open(file_path, 'r', encoding='utf-8') as f:
            text_content = f.read()
    full_prompt = f"{prompt}\n\n--- Document Content ---\n{text_content[:10000]}..."
    yield from invoke_oracle(api_key, model, "You analyze documents. Respond in Markdown.", [{"type": "text", "text": full_prompt}], history)

def summon_chronicle_from_video(api_key, model, prompt, video_path, history, progress=gr.Progress()):
    if video_path is None: raise gr.Error("A video must be provided.")
    get_key(api_key)
    base_video_path, _ = os.path.splitext(video_path)
    progress(0.1, desc="🔮 Extracting Audio...")
    audio_path = f"{base_video_path}.mp3"
    transcript_text = "No audio found."
    try:
        with VideoFileClip(video_path) as clip:
            clip.audio.write_audiofile(audio_path, bitrate="32k", logger=None)
        progress(0.3, desc="🎤 Transcribing Audio...")
        with open(audio_path, "rb") as audio_file:
            transcript_text = o.audio.transcriptions.create(model="whisper-1", file=audio_file).text
    except Exception as e:
        print(f"Audio failed: {e}")
    progress(0.6, desc="🖼️ Sampling Frames...")
    base64Frames = []
    video = cv2.VideoCapture(video_path)
    total_frames, fps = int(video.get(cv2.CAP_PROP_FRAME_COUNT)), video.get(cv2.CAP_PROP_FPS)
    frames_to_skip = max(int(fps * 2), 1)  # ~1 frame every 2 seconds; guard against fps=0
    for curr_frame in range(0, total_frames - 1, frames_to_skip):
        if len(base64Frames) >= 10: break
        video.set(cv2.CAP_PROP_POS_FRAMES, curr_frame)
        success, frame = video.read()
        if not success: break
        _, buffer = cv2.imencode(".jpg", frame)
        base64Frames.append(base64.b64encode(buffer).decode("utf-8"))
    video.release()
    progress(0.8, desc="🌀 Consulting Oracle...")
    user_content = [{"type": "text", "text": f"{prompt}\n\n--- Audio Transcript ---\n{transcript_text}"}, *map(lambda x: {"type": "image_url", "image_url": {"url": f'data:image/jpg;base64,{x}', "detail": "low"}}, base64Frames)]
    yield from invoke_oracle(api_key, model, "You are a video analyst. Respond in Markdown.", user_content, history)

def generate_speech(api_key, tts_model, voice, text, language, audio_format, progress=gr.Progress()):
    """A ritual to give voice to the written word, in any tongue."""
    get_key(api_key)
    # The Radio hands over its flag-prefixed label (e.g. "🇬🇧 English");
    # map it back to the plain language name before building any prompt.
    language = LANGUAGES.get(language, language)

    # Step 1: Translate the text if the target language is not English
    translated_text = text
    if language != "English":
        progress(0.2, desc=f"Translating to {language}...")
        try:
            response = o.chat.completions.create(
                model="gpt-4o",
                messages=[
                    {"role": "system", "content": f"You are a translator. Translate the following text to {language}. Output only the translated text."},
                    {"role": "user", "content": text}
                ],
                temperature=0
            )
            translated_text = response.choices[0].message.content
        except Exception as e:
            raise gr.Error(f"Translation failed: {e}")

    # Step 2: Generate speech from the (possibly translated) text
    progress(0.6, desc="Summoning voice...")
    speech_file_path = Path(__file__).parent / f"speech.{audio_format}"
    try:
        response = o.audio.speech.create(
            model=tts_model,
            voice=voice,
            input=translated_text,
            response_format=audio_format
        )
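        # Note: newer openai releases prefer o.audio.speech.with_streaming_response
        # for writing audio to disk; stream_to_file still works here.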
        response.stream_to_file(speech_file_path)
    except Exception as e:
        raise gr.Error(f"Speech generation failed: {e}")
        
    progress(1.0, desc="Voice summoned!")
    return str(speech_file_path), translated_text

# 🔮 UI
with gr.Blocks(title=UI_TITLE, theme=gr.themes.Soft(primary_hue="red", secondary_hue="orange")) as demo:
    initial_state = load_state()
    app_state = gr.State(initial_state)
    gr.Markdown(H1.format(UI_TITLE))

    with gr.Accordion("🔑 Eldritch Key & Oracle Selection", open=True):
        with gr.Row():
            api_key_box = gr.Textbox(label="🔑 Key", type="password", placeholder="sk-...", scale=3, value=initial_state.get('api_key', ''))
            save_btn = gr.Button("💾", scale=1)
            status_txt = gr.Textbox(interactive=False, scale=1, label="Status")
        model_selector = gr.Dropdown(choices=list(MODELS.keys()), label="🔮 Oracle", value=initial_state.get('model', "GPT-4o ✨"))
        save_btn.click(save_key, inputs=api_key_box, outputs=status_txt)

    chatbot = gr.Chatbot(height=500, label="📜 Scroll of Conversation", type='messages', value=initial_state.get('chatbot', []))

    with gr.Tabs():
        with gr.TabItem("💬 Chat"):
            text_prompt = gr.Textbox(label="Your Quest:", placeholder="Type your message...", value=initial_state.get('text_prompt', ''))
            def summon_chat(api_key, model, prompt, hist):
                # Must be a true generator function; Gradio only streams when the
                # handler itself is a generator, not when it merely returns one.
                yield from invoke_oracle(api_key, model, "You are a helpful AI assistant.", [{"type": "text", "text": prompt}], hist)
            text_event = text_prompt.submit(fn=summon_chat, inputs=[api_key_box, model_selector, text_prompt, chatbot], outputs=chatbot)

        with gr.TabItem("🖼️ Image"):
            with gr.Row():
                image_input = gr.File(label="Upload Image", type="filepath")
                image_output = gr.Image(label="Your Image", type="filepath", interactive=False)
            image_prompt = gr.Textbox(label="Image Prompt:", value=initial_state.get('image_prompt', "What is in this image?"))
            image_btn = gr.Button("👁️ Summon Vision")
            image_input.change(lambda x: x, inputs=image_input, outputs=image_output)
            image_event = image_btn.click(summon_vision_from_image, [api_key_box, model_selector, image_prompt, image_input, chatbot], chatbot)

        with gr.TabItem("🎤 Audio"):
            audio_input = gr.File(label="Upload Audio", type="filepath")
            audio_prompt = gr.Textbox(label="Audio Prompt:", value=initial_state.get('audio_prompt', "Summarize this audio."))
            audio_btn = gr.Button("🗣️ Summon Echo")
            audio_event = audio_btn.click(summon_echo_from_audio, [api_key_box, model_selector, audio_prompt, audio_input, chatbot], chatbot)

        with gr.TabItem("🎥 Video"):
            video_input = gr.File(label="Upload Video", type="filepath")
            video_prompt = gr.Textbox(label="Video Prompt:", value=initial_state.get('video_prompt', "Summarize this video."))
            video_btn = gr.Button("🎬 Summon Chronicle")
            video_event = video_btn.click(summon_chronicle_from_video, [api_key_box, model_selector, video_prompt, video_input, chatbot], chatbot)
            
        with gr.TabItem("📄 Document"):
            doc_input = gr.File(label="Upload PDF or TXT", type="filepath")
            doc_prompt = gr.Textbox(label="Document Prompt:", value=initial_state.get('doc_prompt', "Summarize this document."))
            doc_btn = gr.Button("📖 Summon Wisdom")
            doc_event = doc_btn.click(summon_wisdom_from_text, [api_key_box, model_selector, doc_prompt, doc_input, chatbot], chatbot)
            
        with gr.TabItem("🔊 Speech Synthesis"):
            gr.Markdown(H2.format("Give Voice to Words"))
            tts_language = gr.Radio(choices=list(LANGUAGES.keys()), label="🈯 Language", value=initial_state.get('tts_language', "🇬🇧 English"))
            with gr.Row():
                tts_voice = gr.Dropdown(choices=VOICES, label="🗣️ Voice", value=initial_state.get('tts_voice', "alloy"))
                tts_model_select = gr.Dropdown(choices=TTS_MODELS, label="🧠 TTS Model", value=initial_state.get('tts_model', "gpt-4o-mini-tts"))
                tts_format = gr.Dropdown(choices=FORMATS, label="📦 Format", value=initial_state.get('tts_format', "mp3"))
            tts_text_input = gr.Textbox(label="📜 Text to Speak", lines=4, placeholder="Enter text here...", value=initial_state.get('tts_text', ''))
            tts_btn = gr.Button("🔊 Generate Speech")
            tts_translated_text = gr.Textbox(label="Translated Text (Output)", interactive=False)
            tts_audio_output = gr.Audio(label="🎧 Spoken Word", type="filepath")
            tts_event = tts_btn.click(generate_speech, [api_key_box, tts_model_select, tts_voice, tts_text_input, tts_language, tts_format], [tts_audio_output, tts_translated_text])

    # --- Autosave Event Listeners ---
    components_to_save = {
        'api_key': api_key_box, 'model': model_selector, 'text_prompt': text_prompt,
        'image_prompt': image_prompt, 'audio_prompt': audio_prompt, 'video_prompt': video_prompt,
        'doc_prompt': doc_prompt, 'tts_language': tts_language, 'tts_voice': tts_voice,
        'tts_model': tts_model_select, 'tts_format': tts_format, 'tts_text': tts_text_input
    }
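    # Wrapping each dict key in gr.State passes it as an input *value*, which
    # sidesteps Python's late binding of loop variables in closures.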
    for key, component in components_to_save.items():
        component.change(update_and_save, [gr.State(key), component, app_state], app_state)

    # After any oracle finishes streaming, also inscribe the final chat scroll.
    for event in [text_event, image_event, audio_event, video_event, doc_event]:
        event.then(lambda history, state: update_and_save('chatbot', history, state), [chatbot, app_state], app_state)

if __name__ == "__main__":
    demo.launch(share=True, debug=True)