awacke1 committed
Commit c252fa6 · verified · 1 Parent(s): 2825f65

Update app.py

Files changed (1)
  1. app.py +624 -526
app.py CHANGED
@@ -1,25 +1,14 @@
- #!/usr/bin/env python
- """
- Combined Multimodal AI Suite
- - TorchTransformers-Diffusion-CV-SFT functionality (Camera, PDF, OCR, diffusion image gen, etc.)
- - GPT-4o Omni: Text, Audio, Image, Video processing with chat and paper search
- - Python Code Interpreter for code generation and execution
-
- This app integrates all modalities and adds an “Integrated Workflow” tab that enables you to:
- • Upload documents (e.g. double-page papers)
- • Extract text via OCR and image processing
- • Prompt GPT to generate Python code based on the extracted text
- • Display and execute the generated code
-
- Developed with Streamlit.
- """
-
  import aiofiles
  import asyncio
  import base64
  import fitz
  import glob
  import logging
  import os
  import pandas as pd
  import pytz
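
Aside: the docstring removed above describes a paper-to-code loop (upload a paper, OCR it, ask GPT for code, then run that code). As a standalone illustration rather than part of this commit, a minimal sketch of that loop could look like the following; the `ocr_text` value and prompt wording are placeholders, while the model name and the extract-and-exec pattern mirror what this file actually uses:

# Minimal sketch of the paper-to-code loop (assumes OPENAI_API_KEY is set).
import io
import os
import re
from contextlib import redirect_stdout

from openai import OpenAI

client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
ocr_text = "Table 1 reports a mean accuracy of 0.91 over 5 runs."  # placeholder for real OCR output

# Ask GPT for code grounded in the extracted text.
completion = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": f"Generate a Python script that processes the following scientific text:\n\n{ocr_text}"}],
    max_tokens=300,
)
markdown = completion.choices[0].message.content

# Pull the first ```python fenced block out of the reply and execute it, capturing stdout.
blocks = re.findall(r"```python\s*(.*?)\s*```", markdown, re.DOTALL)
if blocks:
    buffer = io.StringIO()
    with redirect_stdout(buffer):
        exec(blocks[0], {}, {})
    print(buffer.getvalue())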
@@ -28,123 +17,72 @@ import re
  import requests
  import shutil
  import streamlit as st
  import time
  import torch
  import zipfile

  from dataclasses import dataclass
  from datetime import datetime
  from diffusers import StableDiffusionPipeline
  from io import BytesIO
  from openai import OpenAI
  from PIL import Image
  from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel
  from typing import Optional
-
- # --- Additional Imports from GPT-4o Omni ---
- import cv2
- import json
- import streamlit.components.v1 as components
- import textract
- from audio_recorder_streamlit import audio_recorder
- from bs4 import BeautifulSoup
- from collections import deque
- from dotenv import load_dotenv
- from gradio_client import Client, handle_file
- from huggingface_hub import InferenceClient
- from moviepy import VideoFileClip
  from urllib.parse import quote
  from xml.etree import ElementTree as ET
- import openai

- # --- Code Interpreter Imports ---
- import io
- import sys
- from contextlib import redirect_stdout
- import mistune

- # Load environment variables
- load_dotenv()

- # ------------------ Global Configuration ------------------
  st.set_page_config(
-     page_title="Combined Multimodal AI Suite 🚀",
      page_icon="🤖",
      layout="wide",
      initial_sidebar_state="expanded",
      menu_items={
          'Get Help': 'https://huggingface.co/awacke1',
          'Report a Bug': 'https://huggingface.co/spaces/awacke1',
-         'About': "Combined Multimodal AI Suite: Camera, OCR, Chat, Code Generation & Execution"
      }
  )

- # Setup logging
- logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
- logger = logging.getLogger(__name__)
- log_records = []
- class LogCaptureHandler(logging.Handler):
-     def emit(self, record):
-         log_records.append(record)
- logger.addHandler(LogCaptureHandler())
-
- # ------------------ Session State Defaults ------------------
- if 'history' not in st.session_state:
-     st.session_state.history = []
- if 'messages' not in st.session_state:
-     st.session_state.messages = []
- if 'gallery_files' not in st.session_state:
-     st.session_state.gallery_files = []
- if 'builder' not in st.session_state:
-     st.session_state.builder = None
- if 'model_loaded' not in st.session_state:
-     st.session_state.model_loaded = False
- if 'processing' not in st.session_state:
-     st.session_state.processing = {}
- if 'asset_checkboxes' not in st.session_state:
-     st.session_state.asset_checkboxes = {}
- if 'downloaded_pdfs' not in st.session_state:
-     st.session_state.downloaded_pdfs = {}
- if 'unique_counter' not in st.session_state:
-     st.session_state.unique_counter = 0
-
- # ------------------ Utility Functions ------------------
- def generate_filename(prompt, file_type):
-     """Generates a safe filename based on prompt and file type."""
-     central = pytz.timezone('US/Central')
-     safe_date_time = datetime.now(central).strftime("%m%d_%H%M")
-     replaced_prompt = prompt.replace(" ", "_").replace("\n", "_")
-     safe_prompt = "".join(x for x in replaced_prompt if x.isalnum() or x == "_")[:90]
-     return f"{safe_date_time}_{safe_prompt}.{file_type}"
-
- def get_download_link(file_path, mime_type="application/octet-stream", label="Download"):
-     with open(file_path, "rb") as f:
-         b64 = base64.b64encode(f.read()).decode()
-     return f'<a href="data:{mime_type};base64,{b64}" download="{os.path.basename(file_path)}">{label}</a>'
-
- def zip_directory(directory_path, zip_path):
-     with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
-         for root, _, files in os.walk(directory_path):
-             for file in files:
-                 zipf.write(os.path.join(root, file),
-                            os.path.relpath(os.path.join(root, file), os.path.dirname(directory_path)))
-
- def get_gallery_files(file_types=["png", "pdf", "md"]):
-     return sorted(list({f for ext in file_types for f in glob.glob(f"*.{ext}")}))
-
- def download_pdf(url, output_path):
-     try:
-         response = requests.get(url, stream=True, timeout=10)
-         if response.status_code == 200:
-             with open(output_path, "wb") as f:
-                 for chunk in response.iter_content(chunk_size=8192):
-                     f.write(chunk)
-             return True
-     except requests.RequestException as e:
-         logger.error(f"Failed to download {url}: {e}")
-     return False
-
- # ------------------ Model & Diffusion Builders ------------------
  @dataclass
  class ModelConfig:
      name: str
@@ -153,7 +91,7 @@ class ModelConfig:
      domain: Optional[str] = None
      model_type: str = "causal_lm"
      @property
-     def model_path(self):
          return f"models/{self.name}"

  @dataclass
@@ -174,59 +112,150 @@ class ModelBuilder:
          self.jokes = [
              "Why did the AI go to therapy? Too many layers to unpack! 😂",
              "Training complete! Time for a binary coffee break. ☕",
-             "I told my neural network a joke; it couldn't stop dropping bits! 🤖"
          ]
      def load_model(self, model_path: str, config: Optional[ModelConfig] = None):
-         with st.spinner(f"Loading model from {model_path}..."):
              self.model = AutoModelForCausalLM.from_pretrained(model_path)
              self.tokenizer = AutoTokenizer.from_pretrained(model_path)
              if self.tokenizer.pad_token is None:
                  self.tokenizer.pad_token = self.tokenizer.eos_token
              if config:
                  self.config = config
-             device = "cuda" if torch.cuda.is_available() else "cpu"
-             self.model.to(device)
-         st.success(f"Model loaded! {random.choice(self.jokes)}")
          return self
      def save_model(self, path: str):
-         with st.spinner("Saving model..."):
              os.makedirs(os.path.dirname(path), exist_ok=True)
              self.model.save_pretrained(path)
              self.tokenizer.save_pretrained(path)
-         st.success(f"Model saved at {path}!")
-
  class DiffusionBuilder:
      def __init__(self):
          self.config = None
          self.pipeline = None
      def load_model(self, model_path: str, config: Optional[DiffusionConfig] = None):
-         with st.spinner(f"Loading diffusion model from {model_path}..."):
              self.pipeline = StableDiffusionPipeline.from_pretrained(model_path, torch_dtype=torch.float32).to("cpu")
              if config:
                  self.config = config
-         st.success("Diffusion model loaded!")
          return self
      def save_model(self, path: str):
-         with st.spinner("Saving diffusion model..."):
              os.makedirs(os.path.dirname(path), exist_ok=True)
              self.pipeline.save_pretrained(path)
-         st.success(f"Diffusion model saved at {path}!")
      def generate(self, prompt: str):
          return self.pipeline(prompt, num_inference_steps=20).images[0]

- # ------------------ OCR & Image Processing Functions ------------------
  async def process_ocr(image, output_file):
      start_time = time.time()
      status = st.empty()
-     status.text("Processing OCR... (0s)")
      tokenizer = AutoTokenizer.from_pretrained("ucaslcl/GOT-OCR2_0", trust_remote_code=True)
      model = AutoModel.from_pretrained("ucaslcl/GOT-OCR2_0", trust_remote_code=True, torch_dtype=torch.float32).to("cpu").eval()
-     temp_file = f"temp_{int(time.time())}.png"
      image.save(temp_file)
      result = model.chat(tokenizer, temp_file, ocr_type='ocr')
      os.remove(temp_file)
      elapsed = int(time.time() - start_time)
-     status.text(f"OCR completed in {elapsed}s!")
      async with aiofiles.open(output_file, "w") as f:
          await f.write(result)
      return result
@@ -234,15 +263,11 @@ async def process_ocr(image, output_file):
  async def process_image_gen(prompt, output_file):
      start_time = time.time()
      status = st.empty()
-     status.text("Generating image... (0s)")
-     # Use diffusion builder from session if available; otherwise load a default
-     if st.session_state.get('builder') and isinstance(st.session_state.builder, DiffusionBuilder):
-         pipeline = st.session_state.builder.pipeline
-     else:
-         pipeline = StableDiffusionPipeline.from_pretrained("OFA-Sys/small-stable-diffusion-v0", torch_dtype=torch.float32).to("cpu")
      gen_image = pipeline(prompt, num_inference_steps=20).images[0]
      elapsed = int(time.time() - start_time)
-     status.text(f"Image generation completed in {elapsed}s!")
      gen_image.save(output_file)
      return gen_image

@@ -250,97 +275,117 @@ def process_image_with_prompt(image, prompt, model="gpt-4o-mini", detail="auto")
      buffered = BytesIO()
      image.save(buffered, format="PNG")
      img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
-     messages = [{
-         "role": "user",
-         "content": [
-             {"type": "text", "text": prompt},
-             {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img_str}", "detail": detail}}
-         ]
-     }]
-     client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'), organization=os.getenv('OPENAI_ORG_ID'))
      try:
          response = client.chat.completions.create(model=model, messages=messages, max_tokens=300)
          return response.choices[0].message.content
      except Exception as e:
-         return f"Error: {str(e)}"

  def process_text_with_prompt(text, prompt, model="gpt-4o-mini"):
      messages = [{"role": "user", "content": f"{prompt}\n\n{text}"}]
-     client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'), organization=os.getenv('OPENAI_ORG_ID'))
      try:
          response = client.chat.completions.create(model=model, messages=messages, max_tokens=300)
          return response.choices[0].message.content
      except Exception as e:
-         return f"Error: {str(e)}"
-
- # ------------------ PDF Processing Functions ------------------
- async def process_pdf_snapshot(pdf_path, mode="single"):
-     start_time = time.time()
-     status = st.empty()
-     status.text(f"Processing PDF Snapshot ({mode})... (0s)")
-     try:
-         doc = fitz.open(pdf_path)
-         output_files = []
-         if mode == "single":
-             page = doc[0]
-             pix = page.get_pixmap(matrix=fitz.Matrix(2.0, 2.0))
-             output_file = generate_filename("single_snapshot", "png")
-             pix.save(output_file)
-             output_files.append(output_file)
-         elif mode == "twopage":
-             for i in range(min(2, len(doc))):
-                 page = doc[i]
-                 pix = page.get_pixmap(matrix=fitz.Matrix(2.0, 2.0))
-                 output_file = generate_filename(f"twopage_{i}", "png")
-                 pix.save(output_file)
-                 output_files.append(output_file)
-         elif mode == "allpages":
-             for i in range(len(doc)):
-                 page = doc[i]
-                 pix = page.get_pixmap(matrix=fitz.Matrix(2.0, 2.0))
-                 output_file = generate_filename(f"page_{i}", "png")
-                 pix.save(output_file)
-                 output_files.append(output_file)
-         doc.close()
-         elapsed = int(time.time() - start_time)
-         status.text(f"PDF Snapshot ({mode}) completed in {elapsed}s!")
-         return output_files
-     except Exception as e:
-         status.error(f"Error: {str(e)}")
-         return []

- # ------------------ GPT & Chat Functions ------------------
  def process_text(text_input):
      if text_input:
          st.session_state.messages.append({"role": "user", "content": text_input})
          with st.chat_message("user"):
              st.markdown(text_input)
-         client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'), organization=os.getenv('OPENAI_ORG_ID'))
          with st.chat_message("assistant"):
-             completion = client.chat.completions.create(
-                 model="gpt-4o-2024-05-13",
-                 messages=st.session_state.messages,
-                 stream=False
-             )
              return_text = completion.choices[0].message.content
              st.write("Assistant: " + return_text)
          st.session_state.messages.append({"role": "assistant", "content": return_text})
-     return return_text
-
- def process_text2(text_input, model="gpt-4o-2024-05-13"):
-     if text_input:
-         client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'), organization=os.getenv('OPENAI_ORG_ID'))
-         completion = client.chat.completions.create(
-             model=model,
-             messages=st.session_state.messages,
-             stream=False
-         )
-         return_text = completion.choices[0].message.content
-         st.write("Assistant: " + return_text)
-         st.session_state.messages.append({"role": "assistant", "content": return_text})
      return return_text

- # ------------------ Audio & Video Processing Functions ------------------
  def SpeechSynthesis(result):
      documentHTML5 = f'''
      <!DOCTYPE html>
@@ -365,352 +410,405 @@ def SpeechSynthesis(result):
      '''
      components.html(documentHTML5, width=1280, height=300)

- def process_audio(audio_input, text_input=''):
-     if audio_input:
-         # Save and read audio bytes
-         with open("temp_audio.wav", "wb") as file:
-             file.write(audio_input.getvalue())
-         client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'), organization=os.getenv('OPENAI_ORG_ID'))
-         transcription = client.audio.transcriptions.create(model="whisper-1", file=open("temp_audio.wav", "rb"))
-         st.session_state.messages.append({"role": "user", "content": transcription.text})
-         with st.chat_message("assistant"):
-             st.markdown(transcription.text)
-         SpeechSynthesis(transcription.text)
-         filename = generate_filename(transcription.text, "md")
-         with open(filename, "w", encoding="utf-8") as f:
-             f.write(transcription.text)
-         return transcription.text
-
- def process_video_and_audio(video_input):
-     if video_input:
-         # Save video file
-         video_path = video_input.name
-         with open(video_path, "wb") as f:
-             f.write(video_input.getbuffer())
-         # Extract frames
-         base64Frames = []
-         video = cv2.VideoCapture(video_path)
-         total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
-         fps = video.get(cv2.CAP_PROP_FPS)
-         frames_to_skip = int(fps * 1)  # 1 second per frame
-         curr_frame = 0
-         while curr_frame < total_frames - 1:
-             video.set(cv2.CAP_PROP_POS_FRAMES, curr_frame)
-             success, frame = video.read()
-             if not success:
-                 break
-             _, buffer = cv2.imencode(".jpg", frame)
-             base64Frames.append(base64.b64encode(buffer).decode("utf-8"))
-             curr_frame += frames_to_skip
-         video.release()
-         # Audio transcription from video
-         try:
-             clip = VideoFileClip(video_path)
-             audio_path = f"{os.path.splitext(video_path)[0]}.mp3"
-             clip.audio.write_audiofile(audio_path, bitrate="32k")
-             clip.audio.close()
-             clip.close()
-             with open(audio_path, "rb") as f:
-                 audio_data = f.read()
-             client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'), organization=os.getenv('OPENAI_ORG_ID'))
-             transcription = client.audio.transcriptions.create(model="whisper-1", file=BytesIO(audio_data))
-         except Exception as e:
-             transcription = type("Dummy", (), {"text": "No transcript available."})()
-         # Display frames and transcript
-         st.markdown("### Video Frames")
-         for frame_b64 in base64Frames:
-             st.image(f"data:image/jpg;base64,{frame_b64}", use_container_width=True)
-         st.markdown("### Audio Transcription")
-         st.write(transcription.text)
-         return transcription.text
-
- # ------------------ Python Code Executor Functions ------------------
- def extract_python_code(markdown_text):
-     pattern = r"```python\s*(.*?)\s*```"
-     matches = re.findall(pattern, markdown_text, re.DOTALL)
-     return matches
-
- def execute_code(code):
-     buffer = io.StringIO()
-     local_vars = {}
-     try:
-         with redirect_stdout(buffer):
-             exec(code, {}, local_vars)
-         output = buffer.getvalue()
-         return output, None
-     except Exception as e:
-         return None, str(e)
-     finally:
-         buffer.close()
-
- def create_and_save_file(filename, prompt, response, should_save=True):
-     if not should_save:
-         return
-     base_filename, ext = os.path.splitext(filename)
-     if ext in ['.txt', '.htm', '.md']:
-         with open(f"{base_filename}.md", 'w', encoding='utf-8') as file:
-             file.write(response)
-
- # ------------------ Integrated Workflow Function ------------------
- def integrated_workflow():
-     st.header("Integrated Workflow: From Paper to Code")
-     st.markdown("""
-     1. **Upload a PDF or Image** of a paper (double-page images work best).
-     2. **Run OCR** to extract text.
-     3. **Generate Python Code** based on the extracted text using GPT.
-     4. **Review and Execute** the generated code.
-     """)
-     uploaded_file = st.file_uploader("Upload PDF or Image", type=["pdf", "png", "jpg", "jpeg"], key="integrated_file")
-     if uploaded_file:
-         # Save the uploaded file
-         file_path = f"uploaded_{uploaded_file.name}"
-         with open(file_path, "wb") as f:
-             f.write(uploaded_file.getvalue())
-         st.success(f"Uploaded file saved as {file_path}")
-         # If PDF, show first page snapshot; if image, load directly.
-         if uploaded_file.type == "application/pdf":
-             mode = st.selectbox("Snapshot Mode", ["single", "twopage", "allpages"])
-             snapshots = asyncio.run(process_pdf_snapshot(file_path, mode))
-             for snapshot in snapshots:
-                 st.image(Image.open(snapshot), caption=f"Snapshot: {snapshot}", use_container_width=True)
-         else:
-             st.image(Image.open(file_path), caption="Uploaded Image", use_container_width=True)
-         # Run OCR on the file (using first page or the image itself)
-         if st.button("Run OCR on File"):
-             if uploaded_file.type == "application/pdf":
-                 doc = fitz.open(file_path)
-                 page = doc[0]
-                 pix = page.get_pixmap(matrix=fitz.Matrix(2.0, 2.0))
-                 temp_img = f"ocr_{os.path.basename(file_path)}.png"
-                 pix.save(temp_img)
-                 doc.close()
-                 image = Image.open(temp_img)
-             else:
-                 image = Image.open(file_path)
-             ocr_output_file = generate_filename("ocr_output", "txt")
-             ocr_result = asyncio.run(process_ocr(image, ocr_output_file))
-             st.text_area("OCR Output", ocr_result, height=200)
-         # Use extracted OCR text as prompt to generate python code
-         st.markdown("### Generate Python Code from OCR Text")
-         code_prompt = st.text_area("Edit Prompt for Code Generation", value=f"Generate a Python script that processes the following scientific text:\n\n{ocr_result}", height=200)
-         if st.button("Generate Code"):
-             code_generated = process_text_with_prompt(ocr_result, code_prompt, model="gpt-4o-mini")
-             st.code(code_generated, language="python")
-             # Save generated code
-             code_filename = generate_filename("generated_code", "py")
-             with open(code_filename, "w", encoding="utf-8") as f:
-                 f.write(code_generated)
-             st.markdown(get_download_link(code_filename, "text/plain", "Download Generated Code"), unsafe_allow_html=True)
-         # Optionally execute the generated code
-         if st.button("Execute Generated Code"):
-             output, error = execute_code(code_generated)
-             if error:
-                 st.error(f"Error executing code:\n{error}")
-             else:
-                 st.success("Code executed successfully. Output:")
-                 st.code(output)

- # ------------------ Sidebar: Asset Gallery & Logs ------------------
- def update_gallery():
-     container = st.sidebar.empty()
      all_files = get_gallery_files()
      if all_files:
-         container.markdown("### Asset Gallery")
-         cols = container.columns(2)
-         for idx, file in enumerate(all_files[:st.session_state.get('gallery_size', 5)]):
-             with cols[idx % 2]:
                  if file.endswith('.png'):
-                     st.image(Image.open(file), caption=os.path.basename(file), use_container_width=True)
                  else:
-                     st.markdown(os.path.basename(file))
-                 if st.button("Delete "+os.path.basename(file), key="del_"+file):
-                     os.remove(file)
-                     st.experimental_rerun()
-
- update_gallery()
- st.sidebar.subheader("Action Logs")
- for record in log_records:
-     st.sidebar.write(f"{record.asctime} - {record.levelname} - {record.message}")
-
- # ------------------ Main App Navigation ------------------
- st.title("Combined Multimodal AI Suite")
-
- tabs = st.tabs(["Home", "Camera & Images", "PDF & Documents", "Multimodal Chat", "Code Executor", "Integrated Workflow"])
-
- # --- Home Tab ---
- with tabs[0]:
-     st.header("Welcome to the Combined Multimodal AI Suite")
-     st.markdown("""
-     This application integrates multiple AI functionalities:
-
-     - **Camera & Image Processing:** Capture images, generate new images using diffusion models.
-     - **PDF & Document Processing:** Download PDFs, perform OCR, and generate markdown summaries.
-     - **Multimodal Chat:** Chat with GPT-4o using text, audio, image, and video inputs.
-     - **Code Executor:** Write, generate, and execute Python code interactively.
-     - **Integrated Workflow:** Seamlessly extract text from papers and generate & run Python code.
-
-     Use the tabs above to explore each modality.
-     """)
-
- # --- Camera & Images Tab ---
- with tabs[1]:
-     st.header("Camera & Image Processing")
-     st.subheader("Capture and Process Images")
-     col1, col2 = st.columns(2)
-     with col1:
-         cam0_img = st.camera_input("Take a picture - Cam 0", key="cam0")
-         if cam0_img:
-             filename = generate_filename("cam0_snapshot", "png")
-             with open(filename, "wb") as f:
-                 f.write(cam0_img.getvalue())
-             st.image(Image.open(filename), caption="Camera 0 Snapshot", use_container_width=True)
-             st.session_state.history.append(f"Captured {filename}")
-     with col2:
-         cam1_img = st.camera_input("Take a picture - Cam 1", key="cam1")
-         if cam1_img:
-             filename = generate_filename("cam1_snapshot", "png")
-             with open(filename, "wb") as f:
-                 f.write(cam1_img.getvalue())
-             st.image(Image.open(filename), caption="Camera 1 Snapshot", use_container_width=True)
-             st.session_state.history.append(f"Captured {filename}")
-     st.markdown("---")
-     st.subheader("Generate New Image with Diffusion")
-     prompt_img = st.text_input("Enter prompt for image generation", "A neon futuristic cityscape")
-     if st.button("Generate Image"):
-         output_file = generate_filename("gen_output", "png")
-         result_img = asyncio.run(process_image_gen(prompt_img, output_file))
-         st.image(result_img, caption="Generated Image", use_container_width=True)
-
- # --- PDF & Documents Tab ---
- with tabs[2]:
-     st.header("PDF & Document Processing")
-     st.subheader("Download and Process PDFs")
-     url_input = st.text_area("Enter PDF URLs (one per line)", height=100)
-     if st.button("Download PDFs"):
-         urls = [u.strip() for u in url_input.splitlines() if u.strip()]
-         progress_bar = st.progress(0)
-         for idx, url in enumerate(urls):
-             output_path = generate_filename(url, "pdf")
-             if download_pdf(url, output_path):
-                 st.session_state.downloaded_pdfs[url] = output_path
-                 st.success(f"Downloaded: {output_path}")
-             progress_bar.progress((idx + 1) / len(urls))
-     st.markdown("---")
-     st.subheader("OCR & PDF Snapshot")
-     all_assets = get_gallery_files()
-     selected_asset = st.selectbox("Select an asset", all_assets) if all_assets else None
-     if selected_asset and st.button("Run OCR on Selected"):
-         if selected_asset.endswith('.png'):
-             image = Image.open(selected_asset)
-         else:
-             doc = fitz.open(selected_asset)
-             pix = doc[0].get_pixmap(matrix=fitz.Matrix(2.0, 2.0))
-             image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
-             doc.close()
-         output_file = generate_filename("ocr_output", "txt")
-         ocr_result = asyncio.run(process_ocr(image, output_file))
-         st.text_area("OCR Result", ocr_result, height=200)
-     st.markdown("---")
-     st.subheader("Markdown Gallery")
-     md_files = sorted(glob.glob("*.md"))
-     if md_files:
-         for md in md_files:
-             st.markdown(f"**{md}**")
-             st.markdown(get_download_link(md, "text/markdown", "Download MD"), unsafe_allow_html=True)
-
- # --- Multimodal Chat Tab ---
- with tabs[3]:
-     st.header("Multimodal Chat")
-     st.markdown("Chat with GPT-4o using text, audio, image, or video inputs.")
-     mode = st.selectbox("Select Mode", ["Text", "Image", "Audio", "Video"])
-     if mode == "Text":
-         text_input = st.text_input("Enter your text prompt")
-         if st.button("Send Text"):
-             response = process_text(text_input)
-             st.markdown(response)
-     elif mode == "Image":
-         text_prompt = st.text_input("Enter prompt for image analysis", "Describe this image and list 10 facts.")
-         image_file = st.file_uploader("Upload an image", type=["png", "jpg", "jpeg"], key="chat_image")
-         if image_file:
-             image = Image.open(image_file)
-             st.image(image, caption="Uploaded Image", use_container_width=True)
-             response = process_image_with_prompt(image, text_prompt)
-             st.markdown(response)
-     elif mode == "Audio":
-         st.markdown("Record or upload an audio file for transcription.")
-         audio_bytes = audio_recorder()
-         if audio_bytes:
-             st.audio(audio_bytes, format="audio/wav")
-             transcription = process_audio(audio_bytes)
-             st.markdown(transcription)
-     elif mode == "Video":
-         video_file = st.file_uploader("Upload a video file", type=["mp4", "webm"], key="chat_video")
-         if video_file:
-             transcript = process_video_and_audio(video_file)
-             st.markdown("Video Transcript:")
-             st.write(transcript)
-
-     st.markdown("---")
-     st.subheader("Chat History")
-     for msg in st.session_state.messages:
-         with st.chat_message(msg["role"]):
-             st.markdown(msg["content"])
-
- # --- Code Executor Tab ---
- with tabs[4]:
-     st.header("Python Code Executor")
-     st.markdown("Enter Python code below or upload a .py/.md file. The code will be executed in a sandboxed environment.")
-     uploaded_file = st.file_uploader("Upload Python (.py) or Markdown (.md) file", type=["py", "md"], key="code_file")
      if 'code' not in st.session_state:
-         st.session_state.code = """import streamlit as st
- st.write("Hello from the Python Code Executor!")"""
-     if uploaded_file is None:
-         code_input = st.text_area("Python Code Editor:", value=st.session_state.code, height=400, key="code_editor")
-     else:
          content = uploaded_file.getvalue().decode()
          if uploaded_file.type == "text/markdown":
              code_blocks = extract_python_code(content)
-             if code_blocks:
-                 code_input = code_blocks[0]
-             else:
-                 st.error("No Python code block found in the markdown file!")
-                 code_input = ""
          else:
              code_input = content
-     st.code(code_input, language='python')
-     col1, col2 = st.columns([1,1])
      with col1:
          if st.button("▶️ Run Code"):
-             if code_input:
-                 output, error = execute_code(code_input)
-                 if error:
-                     st.error(f"Error:\n{error}")
-                 elif output:
-                     st.code(output)
-                 else:
-                     st.success("Code executed with no output.")
              else:
-                 st.warning("Please enter some code!")
      with col2:
          if st.button("🗑️ Clear Code"):
              st.session_state.code = ""
-             st.experimental_rerun()
-     with st.expander("How to use the Code Executor"):
-         st.markdown("""
-         - Enter or upload Python code.
-         - Click **Run Code** to execute.
-         - The output (or any errors) will be displayed below.
-         """)
-
- # --- Integrated Workflow Tab ---
- with tabs[5]:
-     integrated_workflow()
-
- # ------------------ Chat Input at Bottom ------------------
- if prompt := st.chat_input("GPT-4o Multimodal ChatBot - How can I help you?"):
      st.session_state.messages.append({"role": "user", "content": prompt})
      with st.chat_message("user"):
          st.markdown(prompt)
      with st.chat_message("assistant"):
-         response = process_text2(prompt)
-         st.session_state.messages.append({"role": "assistant", "content": response})
  import aiofiles
  import asyncio
  import base64
+ import cv2
  import fitz
  import glob
+ import io
+ import json
  import logging
+ import math
+ import mistune
  import os
  import pandas as pd
  import pytz
  import random
  import re
  import requests
  import shutil
  import streamlit as st
+ import streamlit.components.v1 as components
+ import sys
+ import textract
  import time
  import torch
  import zipfile

+ from audio_recorder_streamlit import audio_recorder
+ from bs4 import BeautifulSoup
+ from collections import deque
+ from contextlib import redirect_stdout
  from dataclasses import dataclass
  from datetime import datetime
  from diffusers import StableDiffusionPipeline
+ from dotenv import load_dotenv
+ from gradio_client import Client, handle_file
+ from huggingface_hub import InferenceClient
  from io import BytesIO
+ from moviepy import VideoFileClip
  from openai import OpenAI
  from PIL import Image
+ from PyPDF2 import PdfReader
  from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel
  from typing import Optional
  from urllib.parse import quote
  from xml.etree import ElementTree as ET

+ # OpenAI client initialization
+ client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'), organization=os.getenv('OPENAI_ORG_ID'))

+ # Logging setup
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
+ logger = logging.getLogger(__name__)
+ log_records = []
+ class LogCaptureHandler(logging.Handler):
+     def emit(self, record):
+         log_records.append(record)
+ logger.addHandler(LogCaptureHandler())

+ # Streamlit configuration
  st.set_page_config(
+     page_title="AI Multimodal Titan 🚀",
      page_icon="🤖",
      layout="wide",
      initial_sidebar_state="expanded",
      menu_items={
          'Get Help': 'https://huggingface.co/awacke1',
          'Report a Bug': 'https://huggingface.co/spaces/awacke1',
+         'About': "AI Multimodal Titan: PDFs, OCR, Image Gen, Audio/Video, Code Execution, and More! 🌌"
      }
  )

+ # Session state initialization
+ for key in ['history', 'messages', 'processing', 'asset_checkboxes', 'downloaded_pdfs', 'unique_counter', 'search_queries']:
+     st.session_state.setdefault(key, [] if key in ['history', 'messages', 'search_queries'] else {} if key in ['asset_checkboxes', 'downloaded_pdfs', 'processing'] else 0 if key == 'unique_counter' else None)
+ st.session_state.setdefault('builder', None)
+ st.session_state.setdefault('model_loaded', False)
+ st.session_state.setdefault('selected_model_type', "Causal LM")
+ st.session_state.setdefault('selected_model', "None")
+ st.session_state.setdefault('gallery_size', 2)
+ st.session_state.setdefault('asset_gallery_container', st.sidebar.empty())
+ st.session_state.setdefault('cam0_file', None)
+ st.session_state.setdefault('cam1_file', None)
+ st.session_state.setdefault('openai_model', "gpt-4o-2024-05-13")
+
+ # Model configurations
  @dataclass
  class ModelConfig:
      name: str
      domain: Optional[str] = None
      model_type: str = "causal_lm"
      @property
+     def model_path(self):
          return f"models/{self.name}"

  @dataclass
          self.jokes = [
              "Why did the AI go to therapy? Too many layers to unpack! 😂",
              "Training complete! Time for a binary coffee break. ☕",
+             "I told my neural network a joke; it couldn't stop dropping bits! 🤖",
+             "I asked the AI for a pun, and it said, 'I'm punning on parallel processing!' 😄",
+             "Debugging my code is like a stand-up routine—always a series of exceptions! 😆"
          ]
      def load_model(self, model_path: str, config: Optional[ModelConfig] = None):
+         with st.spinner(f"Loading {model_path}..."):
              self.model = AutoModelForCausalLM.from_pretrained(model_path)
              self.tokenizer = AutoTokenizer.from_pretrained(model_path)
              if self.tokenizer.pad_token is None:
                  self.tokenizer.pad_token = self.tokenizer.eos_token
              if config:
                  self.config = config
+             self.model.to("cuda" if torch.cuda.is_available() else "cpu")
+         st.success(f"Model loaded! 🎉 {random.choice(self.jokes)}")
          return self
      def save_model(self, path: str):
+         with st.spinner("Saving model... 💾"):
              os.makedirs(os.path.dirname(path), exist_ok=True)
              self.model.save_pretrained(path)
              self.tokenizer.save_pretrained(path)
+         st.success(f"Model saved at {path}!")
+
  class DiffusionBuilder:
      def __init__(self):
          self.config = None
          self.pipeline = None
      def load_model(self, model_path: str, config: Optional[DiffusionConfig] = None):
+         with st.spinner(f"Loading diffusion model {model_path}..."):
              self.pipeline = StableDiffusionPipeline.from_pretrained(model_path, torch_dtype=torch.float32).to("cpu")
              if config:
                  self.config = config
+         st.success("Diffusion model loaded! 🎨")
          return self
      def save_model(self, path: str):
+         with st.spinner("Saving diffusion model... 💾"):
              os.makedirs(os.path.dirname(path), exist_ok=True)
              self.pipeline.save_pretrained(path)
+         st.success(f"Diffusion model saved at {path}!")
      def generate(self, prompt: str):
          return self.pipeline(prompt, num_inference_steps=20).images[0]

+ # Utility functions
+ def generate_filename(sequence, ext="png", prompt=None):
+     central = pytz.timezone('US/Central')
+     safe_date_time = datetime.now(central).strftime("%m%d_%H%M")
+     if prompt:
+         safe_prompt = re.sub(r'[<>:"/\\|?*\n]', '_', prompt)[:240]
+         return f"{safe_date_time}_{safe_prompt}.{ext}"
+     return f"{sequence}_{time.strftime('%d%m%Y%H%M%S')}.{ext}"
+
+ def pdf_url_to_filename(url):
+     return re.sub(r'[<>:"/\\|?*]', '_', url) + ".pdf"
+
+ def get_download_link(file_path, mime_type="application/pdf", label="Download"):
+     with open(file_path, "rb") as f:
+         data = base64.b64encode(f.read()).decode()
+     return f'<a href="data:{mime_type};base64,{data}" download="{os.path.basename(file_path)}">{label}</a>'
+
+ def zip_directory(directory_path, zip_path):
+     with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
+         for root, _, files in os.walk(directory_path):
+             for file in files:
+                 zipf.write(os.path.join(root, file), os.path.relpath(os.path.join(root, file), os.path.dirname(directory_path)))
+
+ def get_model_files(model_type="causal_lm"):
+     return [d for d in glob.glob("models/*" if model_type == "causal_lm" else "diffusion_models/*") if os.path.isdir(d)] or ["None"]
+
+ def get_gallery_files(file_types=["png", "pdf", "md", "wav", "mp4"]):
+     return sorted(list({f for ext in file_types for f in glob.glob(f"*.{ext}")}))
+
+ def get_pdf_files():
+     return sorted(glob.glob("*.pdf"))
+
+ def download_pdf(url, output_path):
+     try:
+         response = requests.get(url, stream=True, timeout=10)
+         if response.status_code == 200:
+             with open(output_path, "wb") as f:
+                 for chunk in response.iter_content(chunk_size=8192):
+                     f.write(chunk)
+             return True
+     except requests.RequestException as e:
+         logger.error(f"Failed to download {url}: {e}")
+     return False
+
+ # Processing functions
+ async def process_pdf_snapshot(pdf_path, mode="single"):
+     start_time = time.time()
+     status = st.empty()
+     status.text(f"Processing PDF Snapshot ({mode})... (0s)")
+     try:
+         doc = fitz.open(pdf_path)
+         output_files = []
+         if mode == "single":
+             page = doc[0]
+             pix = page.get_pixmap(matrix=fitz.Matrix(2.0, 2.0))
+             output_file = generate_filename("single", "png")
+             pix.save(output_file)
+             output_files.append(output_file)
+         elif mode == "twopage":
+             if len(doc) >= 2:
+                 pix1 = doc[0].get_pixmap(matrix=fitz.Matrix(2.0, 2.0))
+                 pix2 = doc[1].get_pixmap(matrix=fitz.Matrix(2.0, 2.0))
+                 img1 = Image.frombytes("RGB", [pix1.width, pix1.height], pix1.samples)
+                 img2 = Image.frombytes("RGB", [pix2.width, pix2.height], pix2.samples)
+                 combined_img = Image.new("RGB", (pix1.width + pix2.width, max(pix1.height, pix2.height)))
+                 combined_img.paste(img1, (0, 0))
+                 combined_img.paste(img2, (pix1.width, 0))
+                 output_file = generate_filename("twopage", "png")
+                 combined_img.save(output_file)
+                 output_files.append(output_file)
+             else:
+                 page = doc[0]
+                 pix = page.get_pixmap(matrix=fitz.Matrix(2.0, 2.0))
+                 output_file = generate_filename("single", "png")
+                 pix.save(output_file)
+                 output_files.append(output_file)
+         elif mode == "allpages":
+             for i in range(len(doc)):
+                 page = doc[i]
+                 pix = page.get_pixmap(matrix=fitz.Matrix(2.0, 2.0))
+                 output_file = generate_filename(f"page_{i}", "png")
+                 pix.save(output_file)
+                 output_files.append(output_file)
+         doc.close()
+         elapsed = int(time.time() - start_time)
+         status.text(f"PDF Snapshot ({mode}) completed in {elapsed}s!")
+         return output_files
+     except Exception as e:
+         status.error(f"Failed to process PDF: {str(e)}")
+         return []
+
  async def process_ocr(image, output_file):
      start_time = time.time()
      status = st.empty()
+     status.text("Processing GOT-OCR2_0... (0s)")
      tokenizer = AutoTokenizer.from_pretrained("ucaslcl/GOT-OCR2_0", trust_remote_code=True)
      model = AutoModel.from_pretrained("ucaslcl/GOT-OCR2_0", trust_remote_code=True, torch_dtype=torch.float32).to("cpu").eval()
+     temp_file = generate_filename("temp", "png")
      image.save(temp_file)
      result = model.chat(tokenizer, temp_file, ocr_type='ocr')
      os.remove(temp_file)
      elapsed = int(time.time() - start_time)
+     status.text(f"GOT-OCR2_0 completed in {elapsed}s!")
      async with aiofiles.open(output_file, "w") as f:
          await f.write(result)
      return result

  async def process_image_gen(prompt, output_file):
      start_time = time.time()
      status = st.empty()
+     status.text("Processing Image Gen... (0s)")
+     pipeline = (st.session_state['builder'].pipeline if st.session_state.get('builder') and isinstance(st.session_state['builder'], DiffusionBuilder) and st.session_state['builder'].pipeline else StableDiffusionPipeline.from_pretrained("OFA-Sys/small-stable-diffusion-v0", torch_dtype=torch.float32).to("cpu"))
      gen_image = pipeline(prompt, num_inference_steps=20).images[0]
      elapsed = int(time.time() - start_time)
+     status.text(f"Image Gen completed in {elapsed}s!")
      gen_image.save(output_file)
      return gen_image

  def process_image_with_prompt(image, prompt, model="gpt-4o-mini", detail="auto"):
      buffered = BytesIO()
      image.save(buffered, format="PNG")
      img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
+     messages = [{"role": "user", "content": [{"type": "text", "text": prompt}, {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img_str}", "detail": detail}}]}]
      try:
          response = client.chat.completions.create(model=model, messages=messages, max_tokens=300)
          return response.choices[0].message.content
      except Exception as e:
+         return f"Error processing image with GPT: {str(e)}"

  def process_text_with_prompt(text, prompt, model="gpt-4o-mini"):
      messages = [{"role": "user", "content": f"{prompt}\n\n{text}"}]
      try:
          response = client.chat.completions.create(model=model, messages=messages, max_tokens=300)
          return response.choices[0].message.content
      except Exception as e:
+         return f"Error processing text with GPT: {str(e)}"

  def process_text(text_input):
      if text_input:
          st.session_state.messages.append({"role": "user", "content": text_input})
          with st.chat_message("user"):
              st.markdown(text_input)
          with st.chat_message("assistant"):
+             completion = client.chat.completions.create(model=st.session_state["openai_model"], messages=[{"role": m["role"], "content": m["content"]} for m in st.session_state.messages], stream=False)
              return_text = completion.choices[0].message.content
              st.write("Assistant: " + return_text)
+             filename = generate_filename(text_input, "md")
+             with open(filename, "w", encoding="utf-8") as f:
+                 f.write(text_input + "\n\n" + return_text)
          st.session_state.messages.append({"role": "assistant", "content": return_text})
      return return_text

+ def process_audio(audio_input, text_input=''):
+     if isinstance(audio_input, str):
+         with open(audio_input, "rb") as file:
+             audio_input = file.read()
+     transcription = client.audio.transcriptions.create(model="whisper-1", file=audio_input)
+     st.session_state.messages.append({"role": "user", "content": transcription.text})
+     with st.chat_message("assistant"):
+         st.markdown(transcription.text)
+         SpeechSynthesis(transcription.text)
+     filename = generate_filename(transcription.text, "wav")
+     create_audio_file(filename, audio_input, True)
+     filename = generate_filename(transcription.text, "md")
+     with open(filename, "w", encoding="utf-8") as f:
+         f.write(transcription.text + "\n\n" + transcription.text)
+     return transcription.text
+
+ def process_video(video_path, user_prompt):
+     base64Frames, audio_path = process_video_frames(video_path)
+     with open(video_path, "rb") as file:
+         transcription = client.audio.transcriptions.create(model="whisper-1", file=file)
+     response = client.chat.completions.create(
+         model=st.session_state["openai_model"],
+         messages=[
+             {"role": "system", "content": "You are generating a video summary. Create a summary of the provided video and its transcript. Respond in Markdown"},
+             {"role": "user", "content": [
+                 "These are the frames from the video.",
+                 *map(lambda x: {"type": "image_url", "image_url": {"url": f'data:image/jpg;base64,{x}', "detail": "low"}}, base64Frames),
+                 {"type": "text", "text": f"The audio transcription is: {transcription.text}\n\n{user_prompt}"}
+             ]}
+         ],
+         temperature=0,
+     )
+     video_response = response.choices[0].message.content
+     filename_md = generate_filename(video_path + '- ' + video_response, "md")
+     with open(filename_md, "w", encoding="utf-8") as f:
+         f.write(video_response)
+     return video_response
+
+ def process_video_frames(video_path, seconds_per_frame=2):
+     base64Frames = []
+     base_video_path, _ = os.path.splitext(video_path)
+     video = cv2.VideoCapture(video_path)
+     total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
+     fps = video.get(cv2.CAP_PROP_FPS)
+     frames_to_skip = int(fps * seconds_per_frame)
+     curr_frame = 0
+     while curr_frame < total_frames - 1:
+         video.set(cv2.CAP_PROP_POS_FRAMES, curr_frame)
+         success, frame = video.read()
+         if not success:
+             break
+         _, buffer = cv2.imencode(".jpg", frame)
+         base64Frames.append(base64.b64encode(buffer).decode("utf-8"))
+         curr_frame += frames_to_skip
+     video.release()
+     audio_path = f"{base_video_path}.mp3"
+     try:
+         clip = VideoFileClip(video_path)
+         clip.audio.write_audiofile(audio_path, bitrate="32k")
+         clip.audio.close()
+         clip.close()
+     except:
+         logger.info("No audio track found in video.")
+     return base64Frames, audio_path
+
+ def execute_code(code):
+     buffer = io.StringIO()
+     try:
+         with redirect_stdout(buffer):
+             exec(code, {}, {})
+         return buffer.getvalue(), None
+     except Exception as e:
+         return None, str(e)
+     finally:
+         buffer.close()
+
+ def extract_python_code(markdown_text):
+     pattern = r"```python\s*(.*?)\s*```"
+     matches = re.findall(pattern, markdown_text, re.DOTALL)
+     return matches
+
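
Aside: these two helpers are the core of the code-interpreter path: `extract_python_code` pulls every ```python fenced block out of a markdown string, and `execute_code` runs one block while capturing stdout. As a small, hypothetical usage sketch (not committed code), they compose like this:

# Hypothetical usage of the two helpers defined above.
md = "Some prose.\n```python\nprint(2 + 2)\n```"
for block in extract_python_code(md):
    output, error = execute_code(block)
    print(output if error is None else f"error: {error}")  # prints 4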
389
  def SpeechSynthesis(result):
390
  documentHTML5 = f'''
391
  <!DOCTYPE html>
 
410
  '''
411
  components.html(documentHTML5, width=1280, height=300)
412
 
413
+ def search_arxiv(query):
414
+ start_time = time.strftime("%Y-%m-%d %H:%M:%S")
415
+ client = Client("awacke1/Arxiv-Paper-Search-And-QA-RAG-Pattern")
416
+ response1 = client.predict(message="Hello!!", llm_results_use=5, database_choice="Semantic Search", llm_model_picked="mistralai/Mistral-7B-Instruct-v0.2", api_name="/update_with_rag_md")
417
+ Question = f'### 🔎 {query}\r\n'
418
+ References = response1[0]
419
+ References2 = response1[1]
420
+ filename = generate_filename(query, "md")
421
+ with open(filename, "w", encoding="utf-8") as f:
422
+ f.write(Question + References + References2)
423
+ st.session_state.messages.append({"role": "assistant", "content": References + References2})
424
+ response2 = client.predict(query, "mistralai/Mixtral-8x7B-Instruct-v0.1", True, api_name="/ask_llm")
425
+ if len(response2) > 10:
426
+ Answer = response2
427
+ SpeechSynthesis(Answer)
428
+ results = Question + '\r\n' + Answer + '\r\n' + References + '\r\n' + References2
429
+ return results
430
+ return References + References2
431
+
432
+ roleplaying_glossary = {
433
+ "🤖 AI Concepts": {
434
+ "MoE (Mixture of Experts) 🧠": [
435
+ "As a leading AI health researcher, provide an overview of MoE, MAS, memory, and mirroring in healthcare applications.",
436
+ "Explain how MoE and MAS can be leveraged to create AGI and AMI systems for healthcare, as an AI architect."
437
+ ],
438
+ "Multi Agent Systems (MAS) 🤝": [
439
+ "As a renowned MAS researcher, describe the key characteristics of distributed, autonomous, and cooperative MAS.",
440
+ "Discuss how MAS is applied in robotics, simulations, and decentralized problem-solving, as an AI engineer."
441
+ ]
442
+ }
443
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
444
 
445
+ def display_glossary_grid(roleplaying_glossary):
446
+ search_urls = {
447
+ "🚀🌌ArXiv": lambda k: f"/?q={quote(k)}",
448
+ "📖": lambda k: f"https://en.wikipedia.org/wiki/{quote(k)}",
449
+ "🔍": lambda k: f"https://www.google.com/search?q={quote(k)}"
450
+ }
451
+ for category, details in roleplaying_glossary.items():
452
+ st.write(f"### {category}")
453
+ cols = st.columns(len(details))
454
+ for idx, (game, terms) in enumerate(details.items()):
455
+ with cols[idx]:
456
+ st.markdown(f"#### {game}")
457
+ for term in terms:
458
+ links_md = ' '.join([f"[{emoji}]({url(term)})" for emoji, url in search_urls.items()])
459
+ st.markdown(f"**{term}** <small>{links_md}</small>", unsafe_allow_html=True)
460
+
461
+ def create_zip_of_files(files):
462
+ zip_name = "assets.zip"
463
+ with zipfile.ZipFile(zip_name, 'w') as zipf:
464
+ for file in files:
465
+ zipf.write(file)
466
+ return zip_name
467
+
468
+ def get_zip_download_link(zip_file):
469
+ with open(zip_file, 'rb') as f:
470
+ data = f.read()
471
+ b64 = base64.b64encode(data).decode()
472
+ return f'<a href="data:application/zip;base64,{b64}" download="{zip_file}">Download All</a>'
473
+
474
+ def FileSidebar():
475
+ all_files = glob.glob("*.md")
476
+ all_files = [file for file in all_files if len(os.path.splitext(file)[0]) >= 10]
477
+ all_files.sort(key=lambda x: (os.path.splitext(x)[1], x), reverse=True)
478
+ Files1, Files2 = st.sidebar.columns(2)
479
+ with Files1:
480
+ if st.button("🗑 Delete All"):
481
+ for file in all_files:
482
+ os.remove(file)
483
+ st.rerun()
484
+ with Files2:
485
+ if st.button("⬇️ Download"):
486
+ zip_file = create_zip_of_files(all_files)
487
+ st.sidebar.markdown(get_zip_download_link(zip_file), unsafe_allow_html=True)
488
+ file_contents = ''
489
+ file_name = ''
490
+ next_action = ''
491
+ for file in all_files:
492
+ col1, col2, col3, col4, col5 = st.sidebar.columns([1, 6, 1, 1, 1])
493
+ with col1:
494
+ if st.button("🌐", key=f"md_{file}"):
495
+ with open(file, "r", encoding='utf-8') as f:
496
+ file_contents = f.read()
497
+ file_name = file
498
+ next_action = 'md'
499
+ st.session_state['next_action'] = next_action
500
+ with col2:
501
+ st.markdown(get_download_link(file, "text/markdown", file))
502
+ with col3:
503
+ if st.button("📂", key=f"open_{file}"):
504
+ with open(file, "r", encoding='utf-8') as f:
505
+ file_contents = f.read()
506
+ file_name = file
507
+ next_action = 'open'
508
+ st.session_state['lastfilename'] = file
509
+ st.session_state['filename'] = file
510
+ st.session_state['filetext'] = file_contents
511
+ st.session_state['next_action'] = next_action
512
+ with col4:
513
+ if st.button("▶️", key=f"read_{file}"):
514
+ with open(file, "r", encoding='utf-8') as f:
515
+ file_contents = f.read()
516
+ file_name = file
517
+ next_action = 'search'
518
+ st.session_state['next_action'] = next_action
519
+ with col5:
520
+ if st.button("🗑", key=f"delete_{file}"):
521
+ os.remove(file)
522
+ file_name = file
523
+ st.rerun()
524
+ next_action = 'delete'
525
+ st.session_state['next_action'] = next_action
526
+ if len(file_contents) > 0:
527
+ if next_action == 'open':
528
+ if 'lastfilename' not in st.session_state:
529
+ st.session_state['lastfilename'] = ''
530
+ if 'filename' not in st.session_state:
531
+ st.session_state['filename'] = ''
532
+ if 'filetext' not in st.session_state:
533
+ st.session_state['filetext'] = ''
534
+ open1, open2 = st.columns([.8, .2])
535
+ with open1:
536
+ file_name_input = st.text_input(key='file_name_input', label="File Name:", value=file_name)
537
+ file_content_area = st.text_area(key='file_content_area', label="File Contents:", value=file_contents, height=300)
538
+ if file_name_input != file_name:
539
+ os.rename(file_name, file_name_input)
540
+ st.markdown(f'Renamed file {file_name} to {file_name_input}.')
541
+ if file_content_area != file_contents:
542
+ with open(file_name_input, 'w', encoding='utf-8') as f:
543
+ f.write(file_content_area)
544
+ st.markdown(f'Saved {file_name_input}.')
545
+ if next_action == 'search':
546
+ st.text_area("File Contents:", file_contents, height=500)
547
+ filesearch = "Create a streamlit python user app with full code listing: " + file_contents
548
+ st.markdown(filesearch)
549
+ if st.button(key='rerun', label='🔍Re-Code'):
550
+ result = search_arxiv(filesearch)
551
+ st.markdown(result)
552
+ if next_action == 'md':
553
+ st.markdown(file_contents)
554
+ SpeechSynthesis(file_contents)
555
+
556
+ FileSidebar()
557
+
558
+ # Tabs
559
+ tabs = st.tabs(["Camera 📷", "Download 📥", "OCR 🔍", "Build 🌱", "Image Gen 🎨", "PDF 📄", "Image 🖼️", "Audio 🎵", "Video 🎥", "Code 🧑‍💻", "Gallery 📚", "Search 🔎"])
560
+ (tab_camera, tab_download, tab_ocr, tab_build, tab_imggen, tab_pdf, tab_image, tab_audio, tab_video, tab_code, tab_gallery, tab_search) = tabs
561
+
562
+ with tab_camera:
563
+ st.header("Camera Snap 📷")
564
+ cols = st.columns(2)
565
+ for i, cam_key in enumerate(["cam0", "cam1"]):
566
+ with cols[i]:
567
+ cam_img = st.camera_input(f"Take a picture - Cam {i}", key=cam_key)
568
+ if cam_img:
569
+ filename = generate_filename(f"cam{i}", "png")
570
+ if st.session_state[f'cam{i}_file'] and os.path.exists(st.session_state[f'cam{i}_file']):
571
+ os.remove(st.session_state[f'cam{i}_file'])
572
+ with open(filename, "wb") as f:
573
+ f.write(cam_img.getvalue())
574
+ st.session_state[f'cam{i}_file'] = filename
575
+ st.session_state['history'].append(f"Snapshot from Cam {i}: {filename}")
576
+ st.image(Image.open(filename), caption=f"Camera {i}", use_container_width=True)
577
+
578
+ with tab_download:
579
+ st.header("Download PDFs 📥")
580
+ if st.button("Examples 📚"):
581
+ example_urls = ["https://arxiv.org/pdf/2308.03892", "https://arxiv.org/pdf/1912.01703"]
582
+ st.session_state['pdf_urls'] = "\n".join(example_urls)
583
+ url_input = st.text_area("Enter PDF URLs (one per line)", value=st.session_state.get('pdf_urls', ""), height=200)
584
+ if st.button("Robo-Download 🤖"):
585
+ urls = url_input.strip().split("\n")
586
+ progress_bar = st.progress(0)
587
+ for idx, url in enumerate(urls):
588
+ if url:
589
+ output_path = pdf_url_to_filename(url)
590
+ if download_pdf(url, output_path):
591
+ st.session_state['downloaded_pdfs'][url] = output_path
592
+ st.session_state['history'].append(f"Downloaded PDF: {output_path}")
593
+ st.session_state['asset_checkboxes'][output_path] = True
594
+ progress_bar.progress((idx + 1) / len(urls))
595
+
596
+ with tab_ocr:
597
+ st.header("Test OCR 🔍")
598
  all_files = get_gallery_files()
599
  if all_files:
600
+ if st.button("OCR All Assets 🚀"):
601
+ full_text = "# OCR Results\n\n"
602
+ for file in all_files:
 
603
  if file.endswith('.png'):
604
+ image = Image.open(file)
605
  else:
606
+ doc = fitz.open(file)
607
+ pix = doc[0].get_pixmap(matrix=fitz.Matrix(2.0, 2.0))
608
+ image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
609
+ doc.close()
610
+ output_file = generate_filename(f"ocr_{os.path.basename(file)}", "txt")
611
+ result = asyncio.run(process_ocr(image, output_file))
612
+ full_text += f"## {os.path.basename(file)}\n\n{result}\n\n"
613
+ st.session_state['history'].append(f"OCR Test: {file} -> {output_file}")
614
+ md_output_file = generate_filename("full_ocr", "md")
615
+ with open(md_output_file, "w") as f:
616
+ f.write(full_text)
617
+ st.success(f"Full OCR saved to {md_output_file}")
618
+ st.markdown(get_download_link(md_output_file, "text/markdown", "Download Full OCR Markdown"), unsafe_allow_html=True)
619
+ selected_file = st.selectbox("Select Image or PDF", all_files, key="ocr_select")
620
+ if selected_file:
621
+ if selected_file.endswith('.png'):
622
+ image = Image.open(selected_file)
623
+ else:
624
+ doc = fitz.open(selected_file)
625
+ pix = doc[0].get_pixmap(matrix=fitz.Matrix(2.0, 2.0))
626
+ image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
627
+ doc.close()
628
+ st.image(image, caption="Input Image", use_container_width=True)
629
+ if st.button("Run OCR 🚀", key="ocr_run"):
630
+ output_file = generate_filename("ocr_output", "txt")
631
+ result = asyncio.run(process_ocr(image, output_file))
632
+ st.text_area("OCR Result", result, height=200)
633
+ st.session_state['history'].append(f"OCR Test: {selected_file} -> {output_file}")
634
+
635
+with tab_build:
+    st.header("Build Titan 🌱")
+    model_type = st.selectbox("Model Type", ["Causal LM", "Diffusion"], key="build_type")
+    base_model = st.selectbox("Select Model", ["HuggingFaceTB/SmolLM-135M", "Qwen/Qwen1.5-0.5B-Chat"] if model_type == "Causal LM" else ["OFA-Sys/small-stable-diffusion-v0", "stabilityai/stable-diffusion-2-base"])
+    model_name = st.text_input("Model Name", f"tiny-titan-{int(time.time())}")
+    if st.button("Download Model ⬇️"):
+        config = (ModelConfig if model_type == "Causal LM" else DiffusionConfig)(name=model_name, base_model=base_model, size="small")
+        builder = ModelBuilder() if model_type == "Causal LM" else DiffusionBuilder()
+        builder.load_model(base_model, config)
+        builder.save_model(config.model_path)
+        st.session_state['builder'] = builder
+        st.session_state['model_loaded'] = True
+
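+# Image-gen tab: runs the local diffusion pipeline through the async
+# process_image_gen() helper and logs the result to history.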
+with tab_imggen:
+    st.header("Test Image Gen 🎨")
+    prompt = st.text_area("Prompt", "Generate a futuristic cityscape")
+    if st.button("Run Image Gen 🚀"):
+        output_file = generate_filename("gen_output", "png", prompt=prompt)
+        result = asyncio.run(process_image_gen(prompt, output_file))
+        st.image(result, caption="Generated Image", use_container_width=True)
+        st.session_state['history'].append(f"Image Gen Test: {prompt} -> {output_file}")
+
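+# PDF tab: snapshots pages to PNG, OCRs each snapshot with GPT, asks GPT for
+# Python code based on the extracted text, then runs that code with
+# execute_code().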
+with tab_pdf:
+    st.header("PDF Process 📄")
+    uploaded_pdfs = st.file_uploader("Upload PDFs", type=["pdf"], accept_multiple_files=True)
+    view_mode = st.selectbox("View Mode", ["Single Page", "Two Pages"], key="pdf_view_mode")
+    if st.button("Process PDFs"):
+        for pdf_file in uploaded_pdfs or []:
+            pdf_path = generate_filename(pdf_file.name, "pdf")
+            with open(pdf_path, "wb") as f:
+                f.write(pdf_file.read())
+            snapshots = asyncio.run(process_pdf_snapshot(pdf_path, "twopage" if view_mode == "Two Pages" else "single"))
+            for snapshot in snapshots:
+                image = Image.open(snapshot)
+                st.image(image, caption=snapshot)
+                text = process_image_with_prompt(image, "Extract the electronic text from image")
+                st.text_area(f"Extracted Text from {snapshot}", text)
+                code_prompt = f"Generate Python code based on this text:\n\n{text}"
+                code = process_text_with_prompt(text, code_prompt)
+                st.code(code, language="python")
+                # A nested st.button here would never fire: clicking it reruns the
+                # script and the outer "Process PDFs" branch is skipped on that
+                # rerun, so the generated code is executed directly instead.
+                output, error = execute_code(code)
+                if error:
+                    st.error(f"Error: {error}")
+                else:
+                    st.success(f"Output: {output or 'No output'}")
+
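+# Image tab: sends each uploaded image plus a free-form prompt to the GPT
+# vision helper process_image_with_prompt().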
+with tab_image:
+    st.header("Image Process 🖼️")
+    uploaded_images = st.file_uploader("Upload Images", type=["png", "jpg"], accept_multiple_files=True)
+    prompt = st.text_input("Prompt", "Extract the electronic text from image")
+    if st.button("Process Images"):
+        for img_file in uploaded_images or []:
+            img = Image.open(img_file)
+            st.image(img, caption=img_file.name)
+            result = process_image_with_prompt(img, prompt)
+            st.text_area(f"Result for {img_file.name}", result)
+
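+# Audio tab: records from the browser mic (audio_recorder_streamlit), saves a
+# WAV, and hands it to process_audio() (defined earlier; presumably Whisper
+# transcription plus a GPT summary).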
+with tab_audio:
+    st.header("Audio Process 🎵")
+    audio_bytes = audio_recorder()
+    if audio_bytes:
+        filename = generate_filename("recording", "wav")
+        with open(filename, "wb") as f:
+            f.write(audio_bytes)
+        st.audio(filename)
+        process_audio(filename)
+
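+# Video tab: saves the uploaded MP4 and asks process_video() for a markdown
+# summary (presumably by sampling frames and/or transcribing the audio track).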
+with tab_video:
+    st.header("Video Process 🎥")
+    video_input = st.file_uploader("Upload Video", type=["mp4"])
+    if video_input:
+        video_path = generate_filename(video_input.name, "mp4")
+        with open(video_path, "wb") as f:
+            f.write(video_input.read())
+        st.video(video_path)
+        result = process_video(video_path, "Summarize this video in markdown")
+        st.markdown(result)
+
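+# Code tab: edit Python inline, upload a .py file, or pull the first fenced
+# block out of a Markdown file via extract_python_code(), then run it through
+# execute_code().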
+with tab_code:
+    st.header("Code Executor 🧑‍💻")
+    uploaded_file = st.file_uploader("📤 Upload a Python (.py) or Markdown (.md) file", type=['py', 'md'])
     if 'code' not in st.session_state:
+        st.session_state.code = '''import streamlit as st\nst.write("Hello, World!")'''
+    if uploaded_file:
         content = uploaded_file.getvalue().decode()
         if uploaded_file.type == "text/markdown":
             code_blocks = extract_python_code(content)
+            code_input = code_blocks[0] if code_blocks else ""
         else:
             code_input = content
+    else:
+        code_input = st.text_area("Python Code", value=st.session_state.code, height=400)
+    col1, col2 = st.columns([1, 1])
     with col1:
         if st.button("▶️ Run Code"):
+            output, error = execute_code(code_input)
+            if error:
+                st.error(f"Error: {error}")
             else:
+                st.success(f"Output: {output or 'No output'}")
     with col2:
         if st.button("🗑️ Clear Code"):
             st.session_state.code = ""
+            st.rerun()
+
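+# execute_code() is defined earlier in app.py; given the io / redirect_stdout
+# imports at the top of the file, it presumably captures stdout roughly like
+# this sketch (an assumption, not the actual implementation):
+#     buf = io.StringIO()
+#     try:
+#         with redirect_stdout(buf):
+#             exec(code, {})
+#         return buf.getvalue(), None
+#     except Exception as e:
+#         return None, str(e)
+
+# Gallery tab: renders every saved asset inline, dispatching on file extension.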
+with tab_gallery:
+    st.header("Gallery 📚")
+    all_files = get_gallery_files()
+    for file in all_files:
+        if file.endswith('.png'):
+            st.image(Image.open(file), caption=file)
+        elif file.endswith('.pdf'):
+            doc = fitz.open(file)
+            pix = doc[0].get_pixmap(matrix=fitz.Matrix(0.5, 0.5))
+            st.image(Image.frombytes("RGB", [pix.width, pix.height], pix.samples), caption=file)
+            doc.close()
+        elif file.endswith('.md'):
+            with open(file, "r") as f:
+                st.markdown(f.read())
+        elif file.endswith('.wav'):
+            st.audio(file)
+        elif file.endswith('.mp4'):
+            st.video(file)
+
+with tab_search:
+    st.header("ArXiv Search 🔎")
+    query = st.text_input("Search ArXiv", "")
+    if query:
+        result = search_arxiv(query)
+        st.markdown(result)
+
+# Sidebar
+st.sidebar.subheader("Gallery Settings")
+st.session_state['gallery_size'] = st.sidebar.slider("Gallery Size", 1, 10, st.session_state['gallery_size'], key="gallery_size_slider")
+st.sidebar.subheader("Action Logs 📜")
+for record in log_records:
+    # Raw LogRecords only gain .asctime/.message after a Formatter runs, so
+    # format the timestamp and message explicitly here.
+    st.sidebar.write(f"{datetime.fromtimestamp(record.created):%H:%M:%S} - {record.levelname} - {record.getMessage()}")
+st.sidebar.subheader("History 📜")
+for entry in st.session_state.get("history", []):
+    if entry:
+        st.sidebar.write(entry)
+
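+# Sidebar asset gallery: redraws inside a container created earlier in the run
+# ('asset_gallery_container'), so it can refresh without rebuilding the page.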
+def update_gallery():
+    container = st.session_state['asset_gallery_container']
+    container.empty()
+    all_files = get_gallery_files()
+    if all_files:
+        container.markdown("### Asset Gallery 📸📖")
+        cols = container.columns(2)
+        for idx, file in enumerate(all_files[:st.session_state['gallery_size']]):
+            with cols[idx % 2]:
+                if file.endswith('.png'):
+                    st.image(Image.open(file), caption=os.path.basename(file))
+                elif file.endswith('.pdf'):
+                    doc = fitz.open(file)
+                    pix = doc[0].get_pixmap(matrix=fitz.Matrix(0.5, 0.5))
+                    st.image(Image.frombytes("RGB", [pix.width, pix.height], pix.samples), caption=os.path.basename(file))
+                    doc.close()
+                st.checkbox("Select", key=f"asset_{file}", value=st.session_state['asset_checkboxes'].get(file, False))
+                st.markdown(get_download_link(file, "application/octet-stream", "Download"), unsafe_allow_html=True)
+                if st.button("Delete", key=f"delete_{file}"):
+                    os.remove(file)
+                    st.session_state['asset_checkboxes'].pop(file, None)
+                    # st.experimental_rerun() was removed in recent Streamlit;
+                    # st.rerun() matches the call already used in the Code tab.
+                    st.rerun()
+
+update_gallery()
+
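+# `client` (the OpenAI client) and st.session_state["openai_model"] are
+# assumed to be initialized earlier in app.py during setup.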
+# Chatbot
+if prompt := st.chat_input("GPT-4o Multimodal ChatBot - What can I help you with?"):
     st.session_state.messages.append({"role": "user", "content": prompt})
     with st.chat_message("user"):
         st.markdown(prompt)
     with st.chat_message("assistant"):
+        completion = client.chat.completions.create(model=st.session_state["openai_model"], messages=st.session_state.messages, stream=True)
+        placeholder = st.empty()
+        response = ""
+        for chunk in completion:
+            if chunk.choices[0].delta.content:
+                response += chunk.choices[0].delta.content
+                # Stream into a single placeholder rather than writing once after
+                # the loop, so the reply renders incrementally.
+                placeholder.markdown(response)
+        st.session_state.messages.append({"role": "assistant", "content": response})