awacke1 committed
Commit 60e3497 · verified · 1 Parent(s): ed00060

Update app.py

Files changed (1):
app.py (+94 -131)
app.py CHANGED
@@ -11,6 +11,8 @@ import re
 from io import BytesIO
 from PIL import Image
 from pathlib import Path
+import numpy as np
+from gradio_webrtc import WebRTC

 # 📜 CONFIG
 UI_TITLE = "✨🧙‍♂️🔮 GPT-4o Omni-Oracle"
@@ -18,7 +20,7 @@ KEY_FILE = "key.txt"
 STATE_FILE = "app_state.json"
 MODELS = {
     "GPT-4o ✨": "gpt-4o",
-    "o3 (Advanced Reasoning) ": "gpt-4-turbo", # Placeholder
+    "o3 (Advanced Reasoning) 🧠": "gpt-4-turbo", # Placeholder
     "o4-mini (Fastest) ⚡": "gpt-4-turbo", # Placeholder
     "o4-mini-high (Vision) 👁️‍🗨️": "gpt-4o", # Placeholder
     "GPT-4.5 (Research) 🔬": "gpt-4-turbo-preview", # Placeholder
@@ -35,11 +37,18 @@ LANGUAGES = {
     "🇮🇱 Hebrew": "Hebrew", "🇮🇳 Hindi": "Hindi", "🇯🇵 Japanese": "Japanese", "🇳🇿 Maori": "Maori",
     "🇷🇺 Russian": "Russian", "🇪🇸 Spanish": "Spanish"
 }
-
+# For WebRTC - Replace with your own if deploying
+RTC_CONFIGURATION = {
+    "iceServers": [{"urls": ["stun:stun.l.google.com:19302"]}]
+}

 # 🎨 STYLE
 H1 = "# <font size='7'>{0}</font>"
 H2 = "## <font size='6'>{0}</font>"
+CSS = """
+.my-group {max-width: 500px !important; max-height: 500px !important;}
+.my-column {display: flex !important; justify-content: center !important; align-items: center !important;}
+"""

 # 🪄 HELPERS, LORE & AUTOSAVE RITUALS
 def save_state(data: dict):
@@ -76,124 +85,78 @@ def get_key(k: str) -> str:
     o.api_key = k
     return k

-def file_to_base64(file_path):
-    with open(file_path, "rb") as f:
-        return base64.b64encode(f.read()).decode('utf-8')
+# --- Image & Audio Streaming Functions ---
+
+def transform_cv2(frame: np.ndarray, transform: str):
+    """Applies a magical filter to a single frame from a webcam stream."""
+    if transform == "cartoon":
+        img_color = cv2.pyrDown(cv2.pyrDown(frame))
+        for _ in range(6):
+            img_color = cv2.bilateralFilter(img_color, 9, 9, 7)
+        img_color = cv2.pyrUp(cv2.pyrUp(img_color))
+        img_edges = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
+        img_edges = cv2.adaptiveThreshold(
+            cv2.medianBlur(img_edges, 7), 255, cv2.ADAPTIVE_THRESH_MEAN_C,
+            cv2.THRESH_BINARY, 9, 2)
+        img_edges = cv2.cvtColor(img_edges, cv2.COLOR_GRAY2RGB)
+        return cv2.bitwise_and(img_color, img_edges)
+    elif transform == "edges":
+        return cv2.cvtColor(cv2.Canny(frame, 100, 200), cv2.COLOR_GRAY2BGR)
+    elif transform == "flip":
+        return np.flipud(frame)
+    return frame
+
+def transcribe_streaming(audio_chunk, history_state):
+    """Transcribes a chunk of audio, keeping context from previous chunks."""
+    if audio_chunk is None:
+        return history_state, ""
+
+    # In a real scenario, you would use a streaming-capable ASR model.
+    # Here, we simulate it by transcribing each chunk individually.
+    # This is a placeholder for a more complex implementation.
+    get_key(os.getenv("OPENAI_KEY", "")) # Ensure API key is set
+
+    # Save chunk to a temporary file to use with OpenAI API
+    temp_wav_path = "temp_chunk.wav"
+    sample_rate, data = audio_chunk
+    import soundfile as sf
+    sf.write(temp_wav_path, data, sample_rate)

-def invoke_oracle(scribe_key: str, model_name: str, system_prompt: str, user_content: list, history: list):
-    get_key(scribe_key)
-    messages = history + [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_content}]
     try:
-        prophecy = o.chat.completions.create(model=model_name, messages=messages, stream=True)
-        history.append({"role": "user", "content": "..."})
-        history.append({"role": "assistant", "content": ""})
-        for chunk in prophecy:
-            if chunk.choices[0].delta.content:
-                history[-1]['content'] += chunk.choices[0].delta.content
-                yield history
+        with open(temp_wav_path, "rb") as audio_file:
+            transcript = o.audio.transcriptions.create(model="whisper-1", file=audio_file)
+        new_text = transcript.text
     except Exception as e:
-        yield history + [{"role": "assistant", "content": f"🧙‍♂️🔮 A magical disturbance occurred: {str(e)}"}]
-
-# --- Modality-Specific Summoning Rituals ---
-
-def summon_vision_from_image(api_key, model, prompt, image_path, history):
-    if image_path is None: raise gr.Error("An image must be provided.")
-    b64_image = file_to_base64(image_path.name)
-    user_content = [{"type": "text", "text": prompt}, {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{b64_image}"}}]
-    yield from invoke_oracle(api_key, model, "You are an assistant that analyzes images. Respond in Markdown.", user_content, history)
-
-def summon_echo_from_audio(api_key, model, prompt, audio_path, history):
-    if audio_path is None: raise gr.Error("An audio file must be provided.")
-    get_key(api_key)
-    with open(audio_path.name, "rb") as audio_file:
-        transcription = o.audio.transcriptions.create(model="whisper-1", file=audio_file)
-    full_prompt = f"{prompt}\n\n--- Transcription ---\n{transcription.text}"
-    yield from invoke_oracle(api_key, model, "You analyze audio transcripts. Respond in Markdown.", [{"type": "text", "text": full_prompt}], history)
-
-def summon_wisdom_from_text(api_key, model, prompt, file_path, history):
-    if file_path is None: raise gr.Error("A file must be provided.")
-    text_content = ""
-    if file_path.name.lower().endswith('.pdf'):
-        with fitz.open(file_path.name) as doc:
-            text_content = "".join(page.get_text() for page in doc)
-    else:
-        with open(file_path.name, 'r', encoding='utf-8') as f:
-            text_content = f.read()
-    full_prompt = f"{prompt}\n\n--- Document Content ---\n{text_content[:10000]}..."
-    yield from invoke_oracle(api_key, model, "You analyze documents. Respond in Markdown.", [{"type": "text", "text": full_prompt}], history)
+        print(f"Transcription error: {e}")
+        new_text = "(...)"

-def summon_chronicle_from_video(api_key, model, prompt, video_path, history, progress=gr.Progress()):
-    if video_path is None: raise gr.Error("A video must be provided.")
-    get_key(api_key)
-    base_video_path, _ = os.path.splitext(video_path.name)
-    progress(0.1, desc="🔮 Extracting Audio...")
-    audio_path = f"{base_video_path}.mp3"
-    transcript_text = "No audio found."
-    try:
-        with VideoFileClip(video_path.name) as clip:
-            clip.audio.write_audiofile(audio_path, bitrate="32k", logger=None)
-        progress(0.3, desc="🎤 Transcribing Audio...")
-        with open(audio_path, "rb") as audio_file:
-            transcript_text = o.audio.transcriptions.create(model="whisper-1", file=audio_file).text
-    except Exception as e:
-        print(f"Audio failed: {e}")
-    progress(0.6, desc="🖼️ Sampling Frames...")
-    base64Frames = []
-    video = cv2.VideoCapture(video_path.name)
-    total_frames, fps = int(video.get(cv2.CAP_PROP_FRAME_COUNT)), video.get(cv2.CAP_PROP_FPS)
-    frames_to_skip = int(fps * 2)
-    for curr_frame in range(0, total_frames - 1, frames_to_skip):
-        if len(base64Frames) >= 10: break
-        video.set(cv2.CAP_PROP_POS_FRAMES, curr_frame)
-        success, frame = video.read()
-        if not success: break
-        _, buffer = cv2.imencode(".jpg", frame)
-        base64Frames.append(base64.b64encode(buffer).decode("utf-8"))
-    video.release()
-    progress(0.8, desc="🌀 Consulting Oracle...")
-    user_content = [{"type": "text", "text": f"{prompt}\n\n--- Audio Transcript ---\n{transcript_text}"}, *map(lambda x: {"type": "image_url", "image_url": {"url": f'data:image/jpg;base64,{x}', "detail": "low"}}, base64Frames)]
-    yield from invoke_oracle(api_key, model, "You are a video analyst. Respond in Markdown.", user_content, history)
+    history_state += new_text + " "
+    return history_state, history_state

-def generate_speech(api_key, tts_model, voice, text, language, format, progress=gr.Progress()):
-    """A ritual to give voice to the written word, in any tongue."""
+# --- Other Functions (TTS, etc.) ---
+def generate_speech(api_key, tts_model, voice, text, language_key, format, progress=gr.Progress()):
     get_key(api_key)
-
-    # Step 1: Translate the text if the language is not English
+    language = LANGUAGES.get(language_key, "English")
     progress(0.2, desc=f"Translating to {language}...")
     translated_text = text
     if language != "English":
         try:
-            response = o.chat.completions.create(
-                model="gpt-4o",
-                messages=[
-                    {"role": "system", "content": f"You are a translator. Translate the following text to {language}. Output only the translated text."},
-                    {"role": "user", "content": text}
-                ],
-                temperature=0
-            )
+            response = o.chat.completions.create(model="gpt-4o", messages=[{"role": "system", "content": f"Translate to {language}. Output only the translation."}, {"role": "user", "content": text}], temperature=0)
             translated_text = response.choices[0].message.content
         except Exception as e:
             raise gr.Error(f"Translation failed: {e}")
-
-    # Step 2: Generate speech from the (possibly translated) text
     progress(0.6, desc="Summoning voice...")
     speech_file_path = Path(__file__).parent / f"speech.{format}"
     try:
-        response = o.audio.speech.create(
-            model=tts_model,
-            voice=voice,
-            input=translated_text,
-            response_format=format
-        )
+        response = o.audio.speech.create(model=tts_model, voice=voice, input=translated_text, response_format=format)
         response.stream_to_file(speech_file_path)
     except Exception as e:
         raise gr.Error(f"Speech generation failed: {e}")
-
     progress(1.0, desc="Voice summoned!")
     return str(speech_file_path), translated_text

 # 🔮 UI
-with gr.Blocks(title=UI_TITLE, theme=gr.themes.Soft(primary_hue="red", secondary_hue="orange")) as demo:
+with gr.Blocks(title=UI_TITLE, theme=gr.themes.Soft(primary_hue="red", secondary_hue="orange"), css=CSS) as demo:
     initial_state = load_state()
     app_state = gr.State(initial_state)
     gr.Markdown(H1.format(UI_TITLE))
@@ -206,40 +169,40 @@ with gr.Blocks(title=UI_TITLE, theme=gr.themes.Soft(primary_hue="red", secondary
         model_selector = gr.Dropdown(choices=list(MODELS.keys()), label="🔮 Oracle", value=initial_state.get('model', "GPT-4o ✨"))
         save_btn.click(save_key, inputs=api_key_box, outputs=status_txt)

-    chatbot = gr.Chatbot(height=500, label="📜 Scroll of Conversation", type='messages', value=initial_state.get('chatbot', []))
+    chatbot = gr.Chatbot(height=400, label="📜 Scroll of Conversation", type='messages', value=initial_state.get('chatbot', []))

     with gr.Tabs():
         with gr.TabItem("💬 Chat"):
             text_prompt = gr.Textbox(label="Your Quest:", placeholder="Type your message...", value=initial_state.get('text_prompt', ''))
-            text_event = text_prompt.submit(fn=lambda api_key, model, prompt, hist: invoke_oracle(api_key, model, "You are a helpful AI assistant.", [{"type": "text", "text": prompt}], hist), inputs=[api_key_box, model_selector, text_prompt, chatbot], outputs=chatbot)
-
-        with gr.TabItem("🖼️ Image"):
+            # This is a simplified invoke_oracle for text-only chat
+            text_event = text_prompt.submit(fn=lambda k, m, p, h: invoke_oracle(k, m, "You are a helpful AI.", [{"type": "text", "text": p}], h), inputs=[api_key_box, model_selector, text_prompt, chatbot], outputs=chatbot)
+
+        with gr.TabItem("🖼️ Streaming Image"):
+            gr.Markdown(H2.format("Live Image Enchantments"))
+            with gr.Column(elem_classes=["my-column"]):
+                with gr.Group(elem_classes=["my-group"]):
+                    transform_filter = gr.Dropdown(choices=["cartoon", "edges", "flip"], value="flip", label="Transformation")
+                    streaming_image = gr.Image(sources=["webcam"], type="numpy", streaming=True)
+            streaming_image.stream(transform_cv2, [streaming_image, transform_filter], streaming_image, time_limit=30, stream_every=0.1)
+
+        with gr.TabItem("🎤 Streaming Audio"):
+            gr.Markdown(H2.format("Real-time Transcription Rite"))
             with gr.Row():
-                image_input = gr.File(label="Upload Image", type="file")
-                image_output = gr.Image(label="Your Image", type="filepath", interactive=False)
-            image_prompt = gr.Textbox(label="Image Prompt:", value=initial_state.get('image_prompt', "What is in this image?"))
-            image_btn = gr.Button("👁️ Summon Vision")
-            image_input.change(lambda x: x, inputs=image_input, outputs=image_output)
-            image_event = image_btn.click(summon_vision_from_image, [api_key_box, model_selector, image_prompt, image_input, chatbot], chatbot)
+                mic_input = gr.Audio(sources="microphone", streaming=True)
+                transcript_output = gr.Textbox(label="Transcript", interactive=False)
+            transcript_state = gr.State(value="")
+            mic_input.stream(transcribe_streaming, [mic_input, transcript_state], [transcript_state, transcript_output], time_limit=20, stream_every=1)
+
+        with gr.TabItem("👁️ Object Detection"):
+            gr.Markdown(H2.format("Live Scrying with YOLOv10"))
+            gr.HTML("<h3 style='text-align: center'>Requires a separate inference server for YOLOv10. This is a UI placeholder.</h3>")
+            with gr.Column(elem_classes=["my-column"]):
+                with gr.Group(elem_classes=["my-group"]):
+                    webrtc_stream = WebRTC(label="Stream", rtc_configuration=RTC_CONFIGURATION)
+                    conf_threshold = gr.Slider(label="Confidence Threshold", minimum=0.0, maximum=1.0, step=0.05, value=0.30)
+            # Placeholder for the actual stream event handler which would call the YOLOv10 model
+            # webrtc_stream.stream(fn=detection_placeholder, inputs=[webrtc_stream, conf_threshold], outputs=[webrtc_stream], time_limit=10)

-        with gr.TabItem("🎤 Audio"):
-            audio_input = gr.File(label="Upload Audio", type="file")
-            audio_prompt = gr.Textbox(label="Audio Prompt:", value=initial_state.get('audio_prompt', "Summarize this audio."))
-            audio_btn = gr.Button("🗣️ Summon Echo")
-            audio_event = audio_btn.click(summon_echo_from_audio, [api_key_box, model_selector, audio_prompt, audio_input, chatbot], chatbot)
-
-        with gr.TabItem("🎥 Video"):
-            video_input = gr.File(label="Upload Video", type="file")
-            video_prompt = gr.Textbox(label="Video Prompt:", value=initial_state.get('video_prompt', "Summarize this video."))
-            video_btn = gr.Button("🎬 Summon Chronicle")
-            video_event = video_btn.click(summon_chronicle_from_video, [api_key_box, model_selector, video_prompt, video_input, chatbot], chatbot)
-
-        with gr.TabItem("📄 Document"):
-            doc_input = gr.File(label="Upload PDF or TXT", type="file")
-            doc_prompt = gr.Textbox(label="Document Prompt:", value=initial_state.get('doc_prompt', "Summarize this document."))
-            doc_btn = gr.Button("📖 Summon Wisdom")
-            doc_event = doc_btn.click(summon_wisdom_from_text, [api_key_box, model_selector, doc_prompt, doc_input, chatbot], chatbot)
-
         with gr.TabItem("🔊 Speech Synthesis"):
             gr.Markdown(H2.format("Give Voice to Words"))
             tts_language = gr.Radio(choices=list(LANGUAGES.keys()), label="🈯 Language", value=initial_state.get('tts_language', "🇬🇧 English"))
@@ -256,15 +219,15 @@ with gr.Blocks(title=UI_TITLE, theme=gr.themes.Soft(primary_hue="red", secondary
     # --- Autosave Event Listeners ---
     components_to_save = {
         'api_key': api_key_box, 'model': model_selector, 'text_prompt': text_prompt,
-        'image_prompt': image_prompt, 'audio_prompt': audio_prompt, 'video_prompt': video_prompt,
-        'doc_prompt': doc_prompt, 'tts_language': tts_language, 'tts_voice': tts_voice,
+        'tts_language': tts_language, 'tts_voice': tts_voice,
         'tts_model': tts_model_select, 'tts_format': tts_format, 'tts_text': tts_text_input
     }
     for key, component in components_to_save.items():
         component.change(update_and_save, [gr.State(key), component, app_state], app_state)
-
-    for event in [text_event, image_event, audio_event, video_event, doc_event]:
-        event.then(lambda history, state: update_and_save('chatbot', history, state), [chatbot, app_state], app_state)
+    text_event.then(lambda history, state: update_and_save('chatbot', history, state), [chatbot, app_state], app_state)

 if __name__ == "__main__":
-    demo.launch(share=True, debug=True)
+    # A placeholder function for the YOLOv10 detection since we don't have the model loaded here.
+    def detection_placeholder(image, conf):
+        return image # Just return the image as is.
+    demo.launch(share=True, debug=True)
+ demo.launch(share=True, debug=True)