awacke1 committed on
Commit 621f3cd · verified · 1 Parent(s): b2df32a

Update app.py

Files changed (1)
  1. app.py +39 -37
app.py CHANGED
@@ -12,7 +12,8 @@ from io import BytesIO
 from PIL import Image
 from pathlib import Path
 import numpy as np
-from gradio_webrtc import WebRTC
+from fastrtc.gradio import WebRTC
+import soundfile as sf
 
 # 📜 CONFIG
 UI_TITLE = "✨🧙‍♂️🔮 GPT-4o Omni-Oracle"
@@ -26,8 +27,6 @@ MODELS = {
     "GPT-4.5 (Research) 🔬": "gpt-4-turbo-preview", # Placeholder
     "GPT-4.1 (Analysis) 💻": "gpt-4-turbo", # Placeholder
     "GPT-4.1-mini (Everyday) ☕": "gpt-4-turbo", # Placeholder
-    "GPT-4 Turbo 🚀": "gpt-4-turbo",
-    "GPT-3.5 Turbo ⚡": "gpt-3.5-turbo",
 }
 VOICES = ["alloy", "ash", "ballad", "coral", "echo", "fable", "nova", "onyx", "sage", "shimmer"]
 TTS_MODELS = ["gpt-4o-mini-tts", "tts-1", "tts-1-hd"]
@@ -37,7 +36,7 @@ LANGUAGES = {
     "🇮🇱 Hebrew": "Hebrew", "🇮🇳 Hindi": "Hindi", "🇯🇵 Japanese": "Japanese", "🇳🇿 Maori": "Maori",
     "🇷🇺 Russian": "Russian", "🇪🇸 Spanish": "Spanish"
 }
-# For WebRTC - Replace with your own if deploying
+# For WebRTC - Replace with your own if deploying on a cloud provider
 RTC_CONFIGURATION = {
     "iceServers": [{"urls": ["stun:stun.l.google.com:19302"]}]
 }
@@ -52,12 +51,10 @@ CSS = """
52
 
53
  # 🪄 HELPERS, LORE & AUTOSAVE RITUALS
54
  def save_state(data: dict):
55
- """A rune that inscribes the session's memory onto a JSON scroll."""
56
  with open(STATE_FILE, 'w') as f:
57
  json.dump(data, f, indent=4)
58
 
59
  def load_state() -> dict:
60
- """A ritual to recall the session's memory from the JSON scroll."""
61
  if os.path.exists(STATE_FILE):
62
  with open(STATE_FILE, 'r') as f:
63
  try:
@@ -67,37 +64,52 @@ def load_state() -> dict:
     return {}
 
 def update_and_save(key: str, value, state: dict):
-    """A binding spell that updates a memory and immediately inscribes it."""
     state[key] = value
     save_state(state)
     return state
 
 def save_key(k: str) -> str:
-    "💾🔑 A rune to bind the Eldritch Key."
     if not k or not k.strip(): return "🚫 Empty Key"
     with open(KEY_FILE, "w") as f: f.write(k.strip())
     return "🔑✅ Key Saved!"
 
 def get_key(k: str) -> str:
-    "📜🔑 A ritual to summon the Eldritch Key."
     k = k.strip() if k and k.strip() else (open(KEY_FILE).read().strip() if os.path.exists(KEY_FILE) else os.getenv("OPENAI_KEY", ""))
     if not k: raise gr.Error("❗🔑 An Eldritch Key (OpenAI API Key) is required.")
     o.api_key = k
     return k
 
+def invoke_oracle(scribe_key: str, model_key: str, system_prompt: str, user_content: list, history: list):
+    get_key(scribe_key)
+    model_name = MODELS.get(model_key, "gpt-4o")
+    messages = history + [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_content}]
+    try:
+        prophecy = o.chat.completions.create(model=model_name, messages=messages, stream=True)
+        history.append({"role": "user", "content": "..."})
+        history.append({"role": "assistant", "content": ""})
+        for chunk in prophecy:
+            if chunk.choices[0].delta.content:
+                history[-1]['content'] += chunk.choices[0].delta.content
+                yield history
+    except Exception as e:
+        yield history + [{"role": "assistant", "content": f"🧙‍♂️🔮 A magical disturbance occurred: {str(e)}"}]
+
+def handle_text_submission(api_key, model, prompt, history):
+    """A clear path for text quests to the Oracle."""
+    yield from invoke_oracle(api_key, model, "You are a helpful AI assistant.", [{"type": "text", "text": prompt}], history)
+
 # --- Image & Audio Streaming Functions ---
 
 def transform_cv2(frame: np.ndarray, transform: str):
     """Applies a magical filter to a single frame from a webcam stream."""
+    if frame is None: return None
     if transform == "cartoon":
         img_color = cv2.pyrDown(cv2.pyrDown(frame))
         for _ in range(6):
             img_color = cv2.bilateralFilter(img_color, 9, 9, 7)
         img_color = cv2.pyrUp(cv2.pyrUp(img_color))
         img_edges = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
-        img_edges = cv2.adaptiveThreshold(
-            cv2.medianBlur(img_edges, 7), 255, cv2.ADAPTIVE_THRESH_MEAN_C,
-            cv2.THRESH_BINARY, 9, 2)
+        img_edges = cv2.adaptiveThreshold(cv2.medianBlur(img_edges, 7), 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 9, 2)
         img_edges = cv2.cvtColor(img_edges, cv2.COLOR_GRAY2RGB)
         return cv2.bitwise_and(img_color, img_edges)
     elif transform == "edges":
@@ -106,34 +118,27 @@ def transform_cv2(frame: np.ndarray, transform: str):
         return np.flipud(frame)
     return frame
 
-def transcribe_streaming(audio_chunk, history_state):
+def transcribe_streaming(api_key, audio_chunk, history_state):
     """Transcribes a chunk of audio, keeping context from previous chunks."""
     if audio_chunk is None:
-        return history_state, ""
-
-    # In a real scenario, you would use a streaming-capable ASR model.
-    # Here, we simulate it by transcribing each chunk individually.
-    # This is a placeholder for a more complex implementation.
-    get_key(os.getenv("OPENAI_KEY", "")) # Ensure API key is set
-
-    # Save chunk to a temporary file to use with OpenAI API
-    temp_wav_path = "temp_chunk.wav"
+        return history_state, history_state
+    get_key(api_key)
     sample_rate, data = audio_chunk
-    import soundfile as sf
+    temp_wav_path = f"temp_chunk_{hash(data.tobytes())}.wav"
     sf.write(temp_wav_path, data, sample_rate)
-
     try:
         with open(temp_wav_path, "rb") as audio_file:
             transcript = o.audio.transcriptions.create(model="whisper-1", file=audio_file)
         new_text = transcript.text
     except Exception as e:
         print(f"Transcription error: {e}")
-        new_text = "(...)"
-
+        new_text = ""
+    finally:
+        if os.path.exists(temp_wav_path):
+            os.remove(temp_wav_path)
     history_state += new_text + " "
     return history_state, history_state
 
-# --- Other Functions (TTS, etc.) ---
 def generate_speech(api_key, tts_model, voice, text, language_key, format, progress=gr.Progress()):
     get_key(api_key)
     language = LANGUAGES.get(language_key, "English")
@@ -174,8 +179,7 @@ with gr.Blocks(title=UI_TITLE, theme=gr.themes.Soft(primary_hue="red", secondary
     with gr.Tabs():
         with gr.TabItem("💬 Chat"):
             text_prompt = gr.Textbox(label="Your Quest:", placeholder="Type your message...", value=initial_state.get('text_prompt', ''))
-            # This is a simplified invoke_oracle for text-only chat
-            text_event = text_prompt.submit(fn=lambda k, m, p, h: invoke_oracle(k, m, "You are a helpful AI.", [{"type": "text", "text": p}], h), inputs=[api_key_box, model_selector, text_prompt, chatbot], outputs=chatbot)
+            text_event = text_prompt.submit(fn=handle_text_submission, inputs=[api_key_box, model_selector, text_prompt, chatbot], outputs=chatbot)
 
         with gr.TabItem("🖼️ Streaming Image"):
            gr.Markdown(H2.format("Live Image Enchantments"))
@@ -191,16 +195,17 @@ with gr.Blocks(title=UI_TITLE, theme=gr.themes.Soft(primary_hue="red", secondary
             mic_input = gr.Audio(sources="microphone", streaming=True)
             transcript_output = gr.Textbox(label="Transcript", interactive=False)
             transcript_state = gr.State(value="")
-            mic_input.stream(transcribe_streaming, [mic_input, transcript_state], [transcript_state, transcript_output], time_limit=20, stream_every=1)
+            mic_input.stream(transcribe_streaming, [api_key_box, mic_input, transcript_state], [transcript_state, transcript_output], time_limit=30, stream_every=2)
 
-        with gr.TabItem("👁️ Object Detection"):
-            gr.Markdown(H2.format("Live Scrying with YOLOv10"))
-            gr.HTML("<h3 style='text-align: center'>Requires a separate inference server for YOLOv10. This is a UI placeholder.</h3>")
+        with gr.TabItem("👁️ Object Detection (WebRTC)"):
+            gr.Markdown(H2.format("Live Scrying Spell"))
+            gr.HTML("<h3 style='text-align: center'>NOTE: This is a UI placeholder. A separate inference server for the YOLO model is required for this to function.</h3>")
             with gr.Column(elem_classes=["my-column"]):
                 with gr.Group(elem_classes=["my-group"]):
                     webrtc_stream = WebRTC(label="Stream", rtc_configuration=RTC_CONFIGURATION)
                     conf_threshold = gr.Slider(label="Confidence Threshold", minimum=0.0, maximum=1.0, step=0.05, value=0.30)
-                    # Placeholder for the actual stream event handler which would call the YOLOv10 model
+                    # Placeholder for the actual stream event handler which would call a loaded YOLOv10 model
+                    # def detection_placeholder(image, conf): return image
                     # webrtc_stream.stream(fn=detection_placeholder, inputs=[webrtc_stream, conf_threshold], outputs=[webrtc_stream], time_limit=10)
 
         with gr.TabItem("🔊 Speech Synthesis"):
@@ -227,7 +232,4 @@ with gr.Blocks(title=UI_TITLE, theme=gr.themes.Soft(primary_hue="red", secondary
     text_event.then(lambda history, state: update_and_save('chatbot', history, state), [chatbot, app_state], app_state)
 
 if __name__ == "__main__":
-    # A placeholder function for the YOLOv10 detection since we don't have the model loaded here.
-    def detection_placeholder(image, conf):
-        return image # Just return the image as is.
     demo.launch(share=True, debug=True)
 