awacke1 committed on
Commit de093f2 · verified · 1 Parent(s): cf22379

Update app.py

Files changed (1)
  1. app.py +532 -506
app.py CHANGED
@@ -1,14 +1,25 @@
  import aiofiles
  import asyncio
  import base64
- import cv2
  import fitz
  import glob
- import io
- import json
  import logging
- import math
- import mistune
  import os
  import pandas as pd
  import pytz
@@ -17,71 +28,123 @@ import re
  import requests
  import shutil
  import streamlit as st
- import streamlit.components.v1 as components
- import sys
- import textract
  import time
- import tiktoken
  import torch
  import zipfile

- from audio_recorder_streamlit import audio_recorder
- from bs4 import BeautifulSoup
- from collections import deque
- from contextlib import redirect_stdout
  from dataclasses import dataclass
  from datetime import datetime
  from diffusers import StableDiffusionPipeline
- from gradio_client import Client, handle_file
- from huggingface_hub import InferenceClient
  from io import BytesIO
- from moviepy import VideoFileClip
  from openai import OpenAI
  from PIL import Image
- from PyPDF2 import PdfReader
  from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel
  from typing import Optional
  from urllib.parse import quote
  from xml.etree import ElementTree as ET

- # Initialize OpenAI client
- client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'), organization=os.getenv('OPENAI_ORG_ID'))

- # Logging setup
- logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
- logger = logging.getLogger(__name__)
- log_records = []
- class LogCaptureHandler(logging.Handler):
-     def emit(self, record):
-         log_records.append(record)
- logger.addHandler(LogCaptureHandler())

- # Streamlit configuration
  st.set_page_config(
-     page_title="AI Multimodal Titan 🚀",
      page_icon="🤖",
      layout="wide",
      initial_sidebar_state="expanded",
      menu_items={
          'Get Help': 'https://huggingface.co/awacke1',
          'Report a Bug': 'https://huggingface.co/spaces/awacke1',
-         'About': "AI Multimodal Titan: PDFs, OCR, Image Gen, Audio/Video Processing, Code Execution, and More! 🌌"
      }
  )

- # Session state initialization
- for key in ['history', 'messages', 'processing', 'asset_checkboxes', 'downloaded_pdfs', 'unique_counter', 'search_queries']:
-     st.session_state.setdefault(key, [] if key in ['history', 'messages', 'search_queries'] else {} if key in ['asset_checkboxes', 'downloaded_pdfs', 'processing'] else 0 if key == 'unique_counter' else None)
- st.session_state.setdefault('builder', None)
- st.session_state.setdefault('model_loaded', False)
- st.session_state.setdefault('selected_model_type', "Causal LM")
- st.session_state.setdefault('selected_model', "None")
- st.session_state.setdefault('gallery_size', 2)
- st.session_state.setdefault('asset_gallery_container', st.sidebar.empty())
- st.session_state.setdefault('cam0_file', None)
- st.session_state.setdefault('cam1_file', None)
-
- # Model configurations
  @dataclass
  class ModelConfig:
      name: str
@@ -90,7 +153,7 @@ class ModelConfig:
      domain: Optional[str] = None
      model_type: str = "causal_lm"
      @property
-     def model_path(self):
          return f"models/{self.name}"

  @dataclass
@@ -114,128 +177,56 @@ class ModelBuilder:
          "I told my neural network a joke; it couldn't stop dropping bits! 🤖"
      ]
      def load_model(self, model_path: str, config: Optional[ModelConfig] = None):
-         with st.spinner(f"Loading {model_path}..."):
              self.model = AutoModelForCausalLM.from_pretrained(model_path)
              self.tokenizer = AutoTokenizer.from_pretrained(model_path)
              if self.tokenizer.pad_token is None:
                  self.tokenizer.pad_token = self.tokenizer.eos_token
              if config:
                  self.config = config
-             self.model.to("cuda" if torch.cuda.is_available() else "cpu")
-         st.success(f"Model loaded! 🎉 {random.choice(self.jokes)}")
          return self
      def save_model(self, path: str):
-         with st.spinner("Saving model... 💾"):
              os.makedirs(os.path.dirname(path), exist_ok=True)
              self.model.save_pretrained(path)
              self.tokenizer.save_pretrained(path)
-         st.success(f"Model saved at {path}!")
-
  class DiffusionBuilder:
      def __init__(self):
          self.config = None
          self.pipeline = None
      def load_model(self, model_path: str, config: Optional[DiffusionConfig] = None):
-         with st.spinner(f"Loading diffusion model {model_path}..."):
              self.pipeline = StableDiffusionPipeline.from_pretrained(model_path, torch_dtype=torch.float32).to("cpu")
              if config:
                  self.config = config
-         st.success("Diffusion model loaded! 🎨")
          return self
      def save_model(self, path: str):
-         with st.spinner("Saving diffusion model... 💾"):
              os.makedirs(os.path.dirname(path), exist_ok=True)
              self.pipeline.save_pretrained(path)
-         st.success(f"Diffusion model saved at {path}!")
      def generate(self, prompt: str):
          return self.pipeline(prompt, num_inference_steps=20).images[0]

- # Utility functions
- def generate_filename(prompt, ext="png"):
-     central = pytz.timezone('US/Central')
-     safe_date_time = datetime.now(central).strftime("%m%d_%H%M")
-     safe_prompt = re.sub(r'[<>:"/\\|?*\n]', '_', prompt)[:240]
-     return f"{safe_date_time}_{safe_prompt}.{ext}"
-
- def get_download_link(file_path, mime_type="application/pdf", label="Download"):
-     with open(file_path, "rb") as f:
-         data = base64.b64encode(f.read()).decode()
-     return f'<a href="data:{mime_type};base64,{data}" download="{os.path.basename(file_path)}">{label}</a>'
-
- def zip_directory(directory_path, zip_path):
-     with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
-         for root, _, files in os.walk(directory_path):
-             for file in files:
-                 zipf.write(os.path.join(root, file), os.path.relpath(os.path.join(root, file), os.path.dirname(directory_path)))
-
- def get_gallery_files(file_types=["png", "pdf", "md", "wav", "mp4"]):
-     return sorted(list({f for ext in file_types for f in glob.glob(f"*.{ext}")}))
-
- def download_pdf(url, output_path):
-     try:
-         response = requests.get(url, stream=True, timeout=10)
-         if response.status_code == 200:
-             with open(output_path, "wb") as f:
-                 for chunk in response.iter_content(chunk_size=8192):
-                     f.write(chunk)
-             return True
-     except requests.RequestException as e:
-         logger.error(f"Failed to download {url}: {e}")
-     return False
-
- # Processing functions
- async def process_pdf_snapshot(pdf_path, mode="single"):
-     start_time = time.time()
-     status = st.empty()
-     status.text(f"Processing PDF Snapshot ({mode})... (0s)")
-     try:
-         doc = fitz.open(pdf_path)
-         output_files = []
-         if mode == "single":
-             page = doc[0]
-             pix = page.get_pixmap(matrix=fitz.Matrix(2.0, 2.0))
-             output_file = generate_filename("single", "png")
-             pix.save(output_file)
-             output_files.append(output_file)
-         elif mode == "double":
-             if len(doc) >= 2:
-                 pix1 = doc[0].get_pixmap(matrix=fitz.Matrix(2.0, 2.0))
-                 pix2 = doc[1].get_pixmap(matrix=fitz.Matrix(2.0, 2.0))
-                 img1 = Image.frombytes("RGB", [pix1.width, pix1.height], pix1.samples)
-                 img2 = Image.frombytes("RGB", [pix2.width, pix2.height], pix2.samples)
-                 combined_img = Image.new("RGB", (pix1.width + pix2.width, max(pix1.height, pix2.height)))
-                 combined_img.paste(img1, (0, 0))
-                 combined_img.paste(img2, (pix1.width, 0))
-                 output_file = generate_filename("double", "png")
-                 combined_img.save(output_file)
-                 output_files.append(output_file)
-         elif mode == "allpages":
-             for i in range(len(doc)):
-                 page = doc[i]
-                 pix = page.get_pixmap(matrix=fitz.Matrix(2.0, 2.0))
-                 output_file = generate_filename(f"page_{i}", "png")
-                 pix.save(output_file)
-                 output_files.append(output_file)
-         doc.close()
-         elapsed = int(time.time() - start_time)
-         status.text(f"PDF Snapshot ({mode}) completed in {elapsed}s!")
-         return output_files
-     except Exception as e:
-         status.error(f"Failed to process PDF: {str(e)}")
-         return []
-
  async def process_ocr(image, output_file):
      start_time = time.time()
      status = st.empty()
-     status.text("Processing GOT-OCR2_0... (0s)")
      tokenizer = AutoTokenizer.from_pretrained("ucaslcl/GOT-OCR2_0", trust_remote_code=True)
      model = AutoModel.from_pretrained("ucaslcl/GOT-OCR2_0", trust_remote_code=True, torch_dtype=torch.float32).to("cpu").eval()
-     temp_file = generate_filename("temp", "png")
      image.save(temp_file)
      result = model.chat(tokenizer, temp_file, ocr_type='ocr')
      os.remove(temp_file)
      elapsed = int(time.time() - start_time)
-     status.text(f"GOT-OCR2_0 completed in {elapsed}s!")
      async with aiofiles.open(output_file, "w") as f:
          await f.write(result)
      return result
@@ -243,11 +234,15 @@ async def process_ocr(image, output_file):
  async def process_image_gen(prompt, output_file):
      start_time = time.time()
      status = st.empty()
-     status.text("Processing Image Gen... (0s)")
-     pipeline = st.session_state['builder'].pipeline if st.session_state.get('builder') and isinstance(st.session_state['builder'], DiffusionBuilder) else StableDiffusionPipeline.from_pretrained("OFA-Sys/small-stable-diffusion-v0", torch_dtype=torch.float32).to("cpu")
      gen_image = pipeline(prompt, num_inference_steps=20).images[0]
      elapsed = int(time.time() - start_time)
-     status.text(f"Image Gen completed in {elapsed}s!")
      gen_image.save(output_file)
      return gen_image

@@ -255,85 +250,97 @@ def process_image_with_prompt(image, prompt, model="gpt-4o-mini", detail="auto")
      buffered = BytesIO()
      image.save(buffered, format="PNG")
      img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
-     messages = [{"role": "user", "content": [{"type": "text", "text": prompt}, {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img_str}", "detail": detail}}]}]
      try:
          response = client.chat.completions.create(model=model, messages=messages, max_tokens=300)
          return response.choices[0].message.content
      except Exception as e:
-         return f"Error processing image with GPT: {str(e)}"

  def process_text_with_prompt(text, prompt, model="gpt-4o-mini"):
      messages = [{"role": "user", "content": f"{prompt}\n\n{text}"}]
      try:
          response = client.chat.completions.create(model=model, messages=messages, max_tokens=300)
          return response.choices[0].message.content
      except Exception as e:
-         return f"Error processing text with GPT: {str(e)}"
-
- def process_audio(audio_input, text_input=''):
-     with open(audio_input, "rb") as file:
-         transcription = client.audio.transcriptions.create(model="whisper-1", file=file)
-     st.session_state.messages.append({"role": "user", "content": transcription.text})
-     completion = client.chat.completions.create(model="gpt-4o-2024-05-13", messages=[{"role": "user", "content": f"{text_input}\n\n{transcription.text}"}])
-     return_text = completion.choices[0].message.content
-     filename = generate_filename(transcription.text, "md")
-     with open(filename, "w", encoding="utf-8") as f:
-         f.write(text_input + "\n\n" + return_text)
-     st.session_state.messages.append({"role": "assistant", "content": return_text})
-     return transcription.text, return_text
-
- def process_video(video_path, prompt):
-     base64Frames, audio_path = process_video_frames(video_path)
-     with open(video_path, "rb") as file:
-         transcription = client.audio.transcriptions.create(model="whisper-1", file=file)
-     messages = [{"role": "user", "content": ["These are the frames from the video.", *map(lambda x: {"type": "image_url", "image_url": {"url": f'data:image/jpg;base64,{x}', "detail": "low"}}, base64Frames), {"type": "text", "text": f"The audio transcription is: {transcription.text}\n\n{prompt}"}]}]
-     response = client.chat.completions.create(model="gpt-4o-2024-05-13", messages=messages)
-     return response.choices[0].message.content
-
- def process_video_frames(video_path, seconds_per_frame=2):
-     base64Frames = []
-     base_video_path, _ = os.path.splitext(video_path)
-     video = cv2.VideoCapture(video_path)
-     total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
-     fps = video.get(cv2.CAP_PROP_FPS)
-     frames_to_skip = int(fps * seconds_per_frame)
-     curr_frame = 0
-     while curr_frame < total_frames - 1:
-         video.set(cv2.CAP_PROP_POS_FRAMES, curr_frame)
-         success, frame = video.read()
-         if not success:
-             break
-         _, buffer = cv2.imencode(".jpg", frame)
-         base64Frames.append(base64.b64encode(buffer).decode("utf-8"))
-         curr_frame += frames_to_skip
-     video.release()
-     audio_path = f"{base_video_path}.mp3"
-     try:
-         clip = VideoFileClip(video_path)
-         clip.audio.write_audiofile(audio_path, bitrate="32k")
-         clip.audio.close()
-         clip.close()
-     except:
-         logger.info("No audio track found in video.")
-     return base64Frames, audio_path

- def execute_code(code):
-     buffer = io.StringIO()
      try:
-         with redirect_stdout(buffer):
-             exec(code, {}, {})
-         return buffer.getvalue(), None
      except Exception as e:
-         return None, str(e)
-     finally:
-         buffer.close()
-
- def extract_python_code(markdown_text):
-     pattern = r"```python\s*(.*?)\s*```"
-     matches = re.findall(pattern, markdown_text, re.DOTALL)
-     return matches

- # Speech synthesis
  def SpeechSynthesis(result):
      documentHTML5 = f'''
      <!DOCTYPE html>
@@ -358,333 +365,352 @@ def SpeechSynthesis(result):
      '''
      components.html(documentHTML5, width=1280, height=300)

- # ArXiv search
- def search_arxiv(query):
-     start_time = time.strftime("%Y-%m-%d %H:%M:%S")
-     client = Client("awacke1/Arxiv-Paper-Search-And-QA-RAG-Pattern")
-     response1 = client.predict(message="Hello!!", llm_results_use=5, database_choice="Semantic Search", llm_model_picked="mistralai/Mistral-7B-Instruct-v0.2", api_name="/update_with_rag_md")
-     Question = f'### 🔎 {query}\r\n'
-     References = response1[0]
-     References2 = response1[1]
-     filename = generate_filename(query, "md")
-     with open(filename, "w", encoding="utf-8") as f:
-         f.write(Question + References + References2)
-     st.session_state.messages.append({"role": "assistant", "content": References + References2})
-     response2 = client.predict(query, "mistralai/Mixtral-8x7B-Instruct-v0.1", True, api_name="/ask_llm")
-     if len(response2) > 10:
-         Answer = response2
-         SpeechSynthesis(Answer)
-         results = Question + '\r\n' + Answer + '\r\n' + References + '\r\n' + References2
-         return results
-     return References + References2
-
- # Glossary data
- roleplaying_glossary = {
-     "🤖 AI Concepts": {
-         "MoE (Mixture of Experts) 🧠": [
-             "As a leading AI health researcher, provide an overview of MoE, MAS, memory, and mirroring in healthcare applications.",
-             "Explain how MoE and MAS can be leveraged to create AGI and AMI systems for healthcare, as an AI architect."
-         ],
-         "Multi Agent Systems (MAS) 🤝": [
-             "As a renowned MAS researcher, describe the key characteristics of distributed, autonomous, and cooperative MAS.",
-             "Discuss how MAS is applied in robotics, simulations, and decentralized problem-solving, as an AI engineer."
-         ]
-     },
-     "🛠️ AI Tools & Platforms": {
-         "ChatDev 💬": [
-             "As a chatbot developer, ask about the features and capabilities ChatDev offers for building conversational AI.",
-             "Inquire about the pre-built assets, integrations, and multi-platform support in ChatDev, as a product manager."
-         ]
-     }
- }

- def display_glossary_grid(roleplaying_glossary):
-     search_urls = {
-         "🚀🌌ArXiv": lambda k: f"/?q={quote(k)}",
-         "📖": lambda k: f"https://en.wikipedia.org/wiki/{quote(k)}",
-         "🔍": lambda k: f"https://www.google.com/search?q={quote(k)}"
-     }
-     for category, details in roleplaying_glossary.items():
-         st.write(f"### {category}")
-         cols = st.columns(len(details))
-         for idx, (game, terms) in enumerate(details.items()):
-             with cols[idx]:
-                 st.markdown(f"#### {game}")
-                 for term in terms:
-                     links_md = ' '.join([f"[{emoji}]({url(term)})" for emoji, url in search_urls.items()])
-                     st.markdown(f"**{term}** <small>{links_md}</small>", unsafe_allow_html=True)
-
- # File operations
- def create_zip_of_files(files):
-     zip_name = "assets.zip"
-     with zipfile.ZipFile(zip_name, 'w') as zipf:
-         for file in files:
-             zipf.write(file)
-     return zip_name
-
- def get_zip_download_link(zip_file):
-     with open(zip_file, 'rb') as f:
-         data = f.read()
-     b64 = base64.b64encode(data).decode()
-     return f'<a href="data:application/zip;base64,{b64}" download="{zip_file}">Download All</a>'
-
- # Sidebar
- st.sidebar.subheader("Gallery Settings")
- st.session_state['gallery_size'] = st.sidebar.slider("Gallery Size", 1, 10, st.session_state['gallery_size'], key="gallery_size_slider")
-
- # File sidebar
- def FileSidebar():
-     all_files = glob.glob("*.md")
-     all_files = [file for file in all_files if len(os.path.splitext(file)[0]) >= 10]
-     all_files.sort(key=lambda x: (os.path.splitext(x)[1], x), reverse=True)
-     Files1, Files2 = st.sidebar.columns(2)
-     with Files1:
-         if st.button("🗑 Delete All"):
-             for file in all_files:
-                 os.remove(file)
-             st.rerun()
-     with Files2:
-         if st.button("⬇️ Download"):
-             zip_file = create_zip_of_files(all_files)
-             st.sidebar.markdown(get_zip_download_link(zip_file), unsafe_allow_html=True)
-     for file in all_files:
-         col1, col2, col3, col4 = st.sidebar.columns([1, 6, 1, 1])
-         with col1:
-             if st.button("🌐", key=f"md_{file}"):
-                 with open(file, "r", encoding='utf-8') as f:
-                     st.markdown(f.read())
-         with col2:
-             st.markdown(get_download_link(file, "text/markdown", file))
-         with col3:
-             if st.button("📂", key=f"open_{file}"):
-                 with open(file, "r", encoding='utf-8') as f:
-                     st.text_area("File Contents", f.read(), height=300)
-         with col4:
-             if st.button("🗑", key=f"delete_{file}"):
-                 os.remove(file)
-                 st.rerun()
-
- FileSidebar()
-
- # Tabs
- tabs = st.tabs(["Camera 📷", "Download 📥", "OCR 🔍", "Build 🌱", "Image Gen 🎨", "PDF 📄", "Image 🖼️", "Audio 🎵", "Video 🎥", "Code 🧑‍💻", "Gallery 📚", "Search 🔎", "Glossary 📖"])
- (tab_camera, tab_download, tab_ocr, tab_build, tab_imggen, tab_pdf, tab_image, tab_audio, tab_video, tab_code, tab_gallery, tab_search, tab_glossary) = tabs
-
- with tab_camera:
-     st.header("Camera Snap 📷")
-     cols = st.columns(2)
-     for i, cam_key in enumerate(["cam0", "cam1"]):
-         with cols[i]:
-             cam_img = st.camera_input(f"Take a picture - Cam {i}", key=cam_key)
-             if cam_img:
-                 filename = generate_filename(f"cam{i}")
-                 with open(filename, "wb") as f:
-                     f.write(cam_img.getvalue())
-                 st.session_state[f'cam{i}_file'] = filename
-                 st.session_state['history'].append(f"Snapshot from Cam {i}: {filename}")
-                 st.image(Image.open(filename), caption=f"Camera {i}", use_container_width=True)
-
- with tab_download:
-     st.header("Download PDFs 📥")
-     if st.button("Examples 📚"):
-         example_urls = ["https://arxiv.org/pdf/2308.03892", "https://arxiv.org/pdf/1912.01703"]
-         st.session_state['pdf_urls'] = "\n".join(example_urls)
-     url_input = st.text_area("Enter PDF URLs (one per line)", value=st.session_state.get('pdf_urls', ""), height=200)
-     if st.button("Robo-Download 🤖"):
-         urls = url_input.strip().split("\n")
-         progress_bar = st.progress(0)
-         for idx, url in enumerate(urls):
-             if url:
-                 output_path = generate_filename(url, "pdf")
-                 if download_pdf(url, output_path):
-                     st.session_state['downloaded_pdfs'][url] = output_path
-                     st.session_state['history'].append(f"Downloaded PDF: {output_path}")
-                     st.session_state['asset_checkboxes'][output_path] = True
-             progress_bar.progress((idx + 1) / len(urls))
-
- with tab_ocr:
-     st.header("Test OCR 🔍")
-     all_files = get_gallery_files()
-     if all_files:
-         selected_file = st.selectbox("Select File", all_files, key="ocr_select")
-         if selected_file and st.button("Run OCR 🚀"):
-             if selected_file.endswith('.png'):
-                 image = Image.open(selected_file)
-             else:
-                 doc = fitz.open(selected_file)
-                 pix = doc[0].get_pixmap(matrix=fitz.Matrix(2.0, 2.0))
-                 image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
-                 doc.close()
-             output_file = generate_filename("ocr_output", "txt")
-             result = asyncio.run(process_ocr(image, output_file))
-             st.text_area("OCR Result", result, height=200)
-             st.session_state['history'].append(f"OCR Test: {selected_file} -> {output_file}")
-
- with tab_build:
-     st.header("Build Titan 🌱")
-     model_type = st.selectbox("Model Type", ["Causal LM", "Diffusion"], key="build_type")
-     base_model = st.selectbox("Select Model", ["HuggingFaceTB/SmolLM-135M", "Qwen/Qwen1.5-0.5B-Chat"] if model_type == "Causal LM" else ["OFA-Sys/small-stable-diffusion-v0", "stabilityai/stable-diffusion-2-base"])
-     model_name = st.text_input("Model Name", f"tiny-titan-{int(time.time())}")
-     if st.button("Download Model ⬇️"):
-         config = (ModelConfig if model_type == "Causal LM" else DiffusionConfig)(name=model_name, base_model=base_model, size="small")
-         builder = ModelBuilder() if model_type == "Causal LM" else DiffusionBuilder()
-         builder.load_model(base_model, config)
-         builder.save_model(config.model_path)
-         st.session_state['builder'] = builder
-         st.session_state['model_loaded'] = True
-
- with tab_imggen:
-     st.header("Test Image Gen 🎨")
-     prompt = st.text_area("Prompt", "Generate a futuristic cityscape")
-     if st.button("Run Image Gen 🚀"):
-         output_file = generate_filename("gen_output", "png")
-         result = asyncio.run(process_image_gen(prompt, output_file))
-         st.image(result, caption="Generated Image", use_container_width=True)
-         st.session_state['history'].append(f"Image Gen Test: {prompt} -> {output_file}")
-
- with tab_pdf:
-     st.header("PDF Process 📄")
-     uploaded_pdfs = st.file_uploader("Upload PDFs", type=["pdf"], accept_multiple_files=True)
-     view_mode = st.selectbox("View Mode", ["Single Page", "Double Page"], key="pdf_view_mode")
-     if st.button("Process PDFs"):
-         for pdf_file in uploaded_pdfs:
-             pdf_path = generate_filename(pdf_file.name, "pdf")
-             with open(pdf_path, "wb") as f:
-                 f.write(pdf_file.read())
-             snapshots = asyncio.run(process_pdf_snapshot(pdf_path, "double" if view_mode == "Double Page" else "single"))
              for snapshot in snapshots:
-                 st.image(Image.open(snapshot), caption=snapshot)
-                 text = process_image_with_prompt(Image.open(snapshot), "Extract the electronic text from image")
-                 st.text_area(f"Extracted Text from {snapshot}", text)
-                 code_prompt = f"Generate Python code based on this text:\n\n{text}"
-                 code = process_text_with_prompt(text, code_prompt)
-                 st.code(code, language="python")
-                 if st.button(f"Execute Code from {snapshot}"):
-                     output, error = execute_code(code)
                      if error:
-                         st.error(f"Error: {error}")
                      else:
-                         st.success(f"Output: {output or 'No output'}")
-
- with tab_image:
-     st.header("Image Process 🖼️")
-     uploaded_images = st.file_uploader("Upload Images", type=["png", "jpg"], accept_multiple_files=True)
-     prompt = st.text_input("Prompt", "Extract the electronic text from image")
-     if st.button("Process Images"):
-         for img_file in uploaded_images:
-             img = Image.open(img_file)
-             st.image(img, caption=img_file.name)
-             result = process_image_with_prompt(img, prompt)
-             st.text_area(f"Result for {img_file.name}", result)
-
- with tab_audio:
-     st.header("Audio Process 🎵")
-     audio_bytes = audio_recorder()
-     if audio_bytes:
-         filename = generate_filename("recording", "wav")
-         with open(filename, "wb") as f:
-             f.write(audio_bytes)
-         st.audio(filename)
-         transcript, summary = process_audio(filename, "Summarize this audio in markdown")
-         st.text_area("Transcript", transcript)
-         st.markdown(summary)
-
- with tab_video:
-     st.header("Video Process 🎥")
-     video_input = st.file_uploader("Upload Video", type=["mp4"])
-     if video_input:
-         video_path = generate_filename(video_input.name, "mp4")
-         with open(video_path, "wb") as f:
-             f.write(video_input.read())
-         st.video(video_path)
-         result = process_video(video_path, "Summarize this video in markdown")
-         st.markdown(result)
-
- with tab_code:
-     st.header("Code Executor 🧑‍💻")
-     code_input = st.text_area("Python Code", height=400)
-     if st.button("Run Code"):
-         output, error = execute_code(code_input)
-         if error:
-             st.error(f"Error: {error}")
-         else:
-             st.success(f"Output: {output or 'No output'}")

- with tab_gallery:
-     st.header("Gallery 📚")
-     all_files = get_gallery_files()
-     for file in all_files:
-         if file.endswith('.png'):
-             st.image(Image.open(file), caption=file)
-         elif file.endswith('.pdf'):
-             doc = fitz.open(file)
-             pix = doc[0].get_pixmap(matrix=fitz.Matrix(0.5, 0.5))
-             st.image(Image.frombytes("RGB", [pix.width, pix.height], pix.samples), caption=file)
-             doc.close()
-         elif file.endswith('.md'):
-             with open(file, "r") as f:
-                 st.markdown(f.read())
-         elif file.endswith('.wav'):
-             st.audio(file)
-         elif file.endswith('.mp4'):
-             st.video(file)
-
- with tab_search:
-     st.header("ArXiv Search 🔎")
-     query = st.text_input("Search ArXiv", "")
-     if query:
-         result = search_arxiv(query)
-         st.markdown(result)
-
- with tab_glossary:
-     st.header("Glossary 📖")
-     display_glossary_grid(roleplaying_glossary)
-
- # Update gallery in sidebar
  def update_gallery():
-     container = st.session_state['asset_gallery_container']
-     container.empty()
      all_files = get_gallery_files()
      if all_files:
-         container.markdown("### Asset Gallery 📸📖")
          cols = container.columns(2)
-         for idx, file in enumerate(all_files[:st.session_state['gallery_size']]):
              with cols[idx % 2]:
                  if file.endswith('.png'):
-                     st.image(Image.open(file), caption=os.path.basename(file))
-                 elif file.endswith('.pdf'):
-                     doc = fitz.open(file)
-                     pix = doc[0].get_pixmap(matrix=fitz.Matrix(0.5, 0.5))
-                     st.image(Image.frombytes("RGB", [pix.width, pix.height], pix.samples), caption=os.path.basename(file))
-                     doc.close()
-                 st.checkbox("Select", key=f"asset_{file}", value=st.session_state['asset_checkboxes'].get(file, False))
-                 st.markdown(get_download_link(file, "application/octet-stream", "Download"), unsafe_allow_html=True)
-                 if st.button("Delete", key=f"delete_{file}"):
                      os.remove(file)
-                     st.session_state['asset_checkboxes'].pop(file, None)
                      st.experimental_rerun()

  update_gallery()
-
- # Sidebar logs and history
- st.sidebar.subheader("Action Logs 📜")
  for record in log_records:
      st.sidebar.write(f"{record.asctime} - {record.levelname} - {record.message}")
- st.sidebar.subheader("History 📜")
- for entry in st.session_state.get("history", []):
-     if entry:
-         st.sidebar.write(entry)

- # Chatbot
- if prompt := st.chat_input("GPT-4o Multimodal ChatBot - What can I help you with?"):
      st.session_state.messages.append({"role": "user", "content": prompt})
      with st.chat_message("user"):
          st.markdown(prompt)
      with st.chat_message("assistant"):
-         completion = client.chat.completions.create(model="gpt-4o-2024-05-13", messages=st.session_state.messages, stream=True)
-         response = ""
-         for chunk in completion:
-             if chunk.choices[0].delta.content:
-                 response += chunk.choices[0].delta.content
-         st.write(response)
-         st.session_state.messages.append({"role": "assistant", "content": response})
 
+ #!/usr/bin/env python
+ """
+ Combined Multimodal AI Suite
+ - TorchTransformers-Diffusion-CV-SFT functionality (Camera, PDF, OCR, diffusion image gen, etc.)
+ - GPT-4o Omni: Text, Audio, Image, Video processing with chat and paper search
+ - Python Code Interpreter for code generation and execution
+
+ This app integrates all modalities and adds an “Integrated Workflow” tab that enables you to:
+ • Upload documents (e.g. double-page papers)
+ • Extract text via OCR and image processing
+ • Prompt GPT to generate Python code based on the extracted text
+ • Display and execute the generated code
+
+ Developed with Streamlit.
+ """
+
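The docstring's four steps map onto helpers defined further down in this file (process_pdf_snapshot, process_ocr, process_text_with_prompt, execute_code). A minimal sketch of how they chain together (the function name paper_to_code is hypothetical; this assumes a running Streamlit session with API keys configured):

```python
import asyncio
from PIL import Image

def paper_to_code(pdf_path: str):
    # Hypothetical glue for the workflow described above.
    # 1. Rasterize the first PDF page to a PNG snapshot.
    snapshot = asyncio.run(process_pdf_snapshot(pdf_path, "single"))[0]
    # 2. Extract the text with the GOT-OCR2_0 model.
    text = asyncio.run(process_ocr(Image.open(snapshot), "ocr_output.txt"))
    # 3. Ask GPT to turn the extracted text into a Python script.
    code = process_text_with_prompt(
        text, "Generate a Python script that processes the following scientific text:")
    # 4. Run the generated code; returns a (stdout, error) pair.
    return execute_code(code)
```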
  import aiofiles
  import asyncio
  import base64
  import fitz
  import glob
  import logging
  import os
  import pandas as pd
  import pytz
  import requests
  import shutil
  import streamlit as st
  import time
  import torch
  import zipfile

  from dataclasses import dataclass
  from datetime import datetime
  from diffusers import StableDiffusionPipeline
  from io import BytesIO
  from openai import OpenAI
  from PIL import Image
  from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel
  from typing import Optional
+
+ # --- Additional Imports from GPT-4o Omni ---
+ import cv2
+ import json
+ import streamlit.components.v1 as components
+ import textract
+ from audio_recorder_streamlit import audio_recorder
+ from bs4 import BeautifulSoup
+ from collections import deque
+ from dotenv import load_dotenv
+ from gradio_client import Client, handle_file
+ from huggingface_hub import InferenceClient
+ from moviepy import VideoFileClip
  from urllib.parse import quote
  from xml.etree import ElementTree as ET
+ import openai

+ # --- Code Interpreter Imports ---
+ import io
+ import sys
+ from contextlib import redirect_stdout
+ import mistune

+ # Load environment variables
+ load_dotenv()

+ # ------------------ Global Configuration ------------------
  st.set_page_config(
+     page_title="Combined Multimodal AI Suite 🚀",
      page_icon="🤖",
      layout="wide",
      initial_sidebar_state="expanded",
      menu_items={
          'Get Help': 'https://huggingface.co/awacke1',
          'Report a Bug': 'https://huggingface.co/spaces/awacke1',
+         'About': "Combined Multimodal AI Suite: Camera, OCR, Chat, Code Generation & Execution"
      }
  )

+ # Setup logging
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
+ logger = logging.getLogger(__name__)
+ log_records = []
+ class LogCaptureHandler(logging.Handler):
+     def emit(self, record):
+         log_records.append(record)
+ logger.addHandler(LogCaptureHandler())
+
+ # ------------------ Session State Defaults ------------------
+ if 'history' not in st.session_state:
+     st.session_state.history = []
+ if 'messages' not in st.session_state:
+     st.session_state.messages = []
+ if 'gallery_files' not in st.session_state:
+     st.session_state.gallery_files = []
+ if 'builder' not in st.session_state:
+     st.session_state.builder = None
+ if 'model_loaded' not in st.session_state:
+     st.session_state.model_loaded = False
+ if 'processing' not in st.session_state:
+     st.session_state.processing = {}
+ if 'asset_checkboxes' not in st.session_state:
+     st.session_state.asset_checkboxes = {}
+ if 'downloaded_pdfs' not in st.session_state:
+     st.session_state.downloaded_pdfs = {}
+ if 'unique_counter' not in st.session_state:
+     st.session_state.unique_counter = 0
+
+ # ------------------ Utility Functions ------------------
+ def generate_filename(prompt, file_type):
+     """Generates a safe filename based on prompt and file type."""
+     central = pytz.timezone('US/Central')
+     safe_date_time = datetime.now(central).strftime("%m%d_%H%M")
+     replaced_prompt = prompt.replace(" ", "_").replace("\n", "_")
+     safe_prompt = "".join(x for x in replaced_prompt if x.isalnum() or x == "_")[:90]
+     return f"{safe_date_time}_{safe_prompt}.{file_type}"
+
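For illustration, a hypothetical call and its result shape (the MMDD_HHMM prefix depends on the current US/Central time):

```python
# Hypothetical example: spaces become underscores, other punctuation is
# dropped, and the prompt portion is capped at 90 characters.
generate_filename("A neon futuristic cityscape", "png")
# -> e.g. '0412_0930_A_neon_futuristic_cityscape.png'
```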
+ def get_download_link(file_path, mime_type="application/octet-stream", label="Download"):
+     with open(file_path, "rb") as f:
+         b64 = base64.b64encode(f.read()).decode()
+     return f'<a href="data:{mime_type};base64,{b64}" download="{os.path.basename(file_path)}">{label}</a>'
+
+ def zip_directory(directory_path, zip_path):
+     with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
+         for root, _, files in os.walk(directory_path):
+             for file in files:
+                 zipf.write(os.path.join(root, file),
+                            os.path.relpath(os.path.join(root, file), os.path.dirname(directory_path)))
+
+ def get_gallery_files(file_types=["png", "pdf", "md"]):
+     return sorted(list({f for ext in file_types for f in glob.glob(f"*.{ext}")}))
+
+ def download_pdf(url, output_path):
+     try:
+         response = requests.get(url, stream=True, timeout=10)
+         if response.status_code == 200:
+             with open(output_path, "wb") as f:
+                 for chunk in response.iter_content(chunk_size=8192):
+                     f.write(chunk)
+             return True
+     except requests.RequestException as e:
+         logger.error(f"Failed to download {url}: {e}")
+     return False
+
+ # ------------------ Model & Diffusion Builders ------------------
  @dataclass
  class ModelConfig:
      name: str

      domain: Optional[str] = None
      model_type: str = "causal_lm"
      @property
+     def model_path(self):
          return f"models/{self.name}"

  @dataclass

          "I told my neural network a joke; it couldn't stop dropping bits! 🤖"
      ]
      def load_model(self, model_path: str, config: Optional[ModelConfig] = None):
+         with st.spinner(f"Loading model from {model_path}..."):
              self.model = AutoModelForCausalLM.from_pretrained(model_path)
              self.tokenizer = AutoTokenizer.from_pretrained(model_path)
              if self.tokenizer.pad_token is None:
                  self.tokenizer.pad_token = self.tokenizer.eos_token
              if config:
                  self.config = config
+             device = "cuda" if torch.cuda.is_available() else "cpu"
+             self.model.to(device)
+         st.success(f"Model loaded! {random.choice(self.jokes)}")
          return self
      def save_model(self, path: str):
+         with st.spinner("Saving model..."):
              os.makedirs(os.path.dirname(path), exist_ok=True)
              self.model.save_pretrained(path)
              self.tokenizer.save_pretrained(path)
+         st.success(f"Model saved at {path}!")
+
  class DiffusionBuilder:
      def __init__(self):
          self.config = None
          self.pipeline = None
      def load_model(self, model_path: str, config: Optional[DiffusionConfig] = None):
+         with st.spinner(f"Loading diffusion model from {model_path}..."):
              self.pipeline = StableDiffusionPipeline.from_pretrained(model_path, torch_dtype=torch.float32).to("cpu")
              if config:
                  self.config = config
+         st.success("Diffusion model loaded!")
          return self
      def save_model(self, path: str):
+         with st.spinner("Saving diffusion model..."):
              os.makedirs(os.path.dirname(path), exist_ok=True)
              self.pipeline.save_pretrained(path)
+         st.success(f"Diffusion model saved at {path}!")
      def generate(self, prompt: str):
          return self.pipeline(prompt, num_inference_steps=20).images[0]

+ # ------------------ OCR & Image Processing Functions ------------------
  async def process_ocr(image, output_file):
      start_time = time.time()
      status = st.empty()
+     status.text("Processing OCR... (0s)")
      tokenizer = AutoTokenizer.from_pretrained("ucaslcl/GOT-OCR2_0", trust_remote_code=True)
      model = AutoModel.from_pretrained("ucaslcl/GOT-OCR2_0", trust_remote_code=True, torch_dtype=torch.float32).to("cpu").eval()
+     temp_file = f"temp_{int(time.time())}.png"
      image.save(temp_file)
      result = model.chat(tokenizer, temp_file, ocr_type='ocr')
      os.remove(temp_file)
      elapsed = int(time.time() - start_time)
+     status.text(f"OCR completed in {elapsed}s!")
      async with aiofiles.open(output_file, "w") as f:
          await f.write(result)
      return result

  async def process_image_gen(prompt, output_file):
      start_time = time.time()
      status = st.empty()
+     status.text("Generating image... (0s)")
+     # Use diffusion builder from session if available; otherwise load a default
+     if st.session_state.get('builder') and isinstance(st.session_state.builder, DiffusionBuilder):
+         pipeline = st.session_state.builder.pipeline
+     else:
+         pipeline = StableDiffusionPipeline.from_pretrained("OFA-Sys/small-stable-diffusion-v0", torch_dtype=torch.float32).to("cpu")
      gen_image = pipeline(prompt, num_inference_steps=20).images[0]
      elapsed = int(time.time() - start_time)
+     status.text(f"Image generation completed in {elapsed}s!")
      gen_image.save(output_file)
      return gen_image

  def process_image_with_prompt(image, prompt, model="gpt-4o-mini", detail="auto"):
      buffered = BytesIO()
      image.save(buffered, format="PNG")
      img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
+     messages = [{
+         "role": "user",
+         "content": [
+             {"type": "text", "text": prompt},
+             {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img_str}", "detail": detail}}
+         ]
+     }]
+     client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'), organization=os.getenv('OPENAI_ORG_ID'))
      try:
          response = client.chat.completions.create(model=model, messages=messages, max_tokens=300)
          return response.choices[0].message.content
      except Exception as e:
+         return f"Error: {str(e)}"

  def process_text_with_prompt(text, prompt, model="gpt-4o-mini"):
      messages = [{"role": "user", "content": f"{prompt}\n\n{text}"}]
+     client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'), organization=os.getenv('OPENAI_ORG_ID'))
      try:
          response = client.chat.completions.create(model=model, messages=messages, max_tokens=300)
          return response.choices[0].message.content
      except Exception as e:
+         return f"Error: {str(e)}"

+ # ------------------ PDF Processing Functions ------------------
+ async def process_pdf_snapshot(pdf_path, mode="single"):
+     start_time = time.time()
+     status = st.empty()
+     status.text(f"Processing PDF Snapshot ({mode})... (0s)")
      try:
+         doc = fitz.open(pdf_path)
+         output_files = []
+         if mode == "single":
+             page = doc[0]
+             pix = page.get_pixmap(matrix=fitz.Matrix(2.0, 2.0))
+             output_file = generate_filename("single_snapshot", "png")
+             pix.save(output_file)
+             output_files.append(output_file)
+         elif mode == "twopage":
+             for i in range(min(2, len(doc))):
+                 page = doc[i]
+                 pix = page.get_pixmap(matrix=fitz.Matrix(2.0, 2.0))
+                 output_file = generate_filename(f"twopage_{i}", "png")
+                 pix.save(output_file)
+                 output_files.append(output_file)
+         elif mode == "allpages":
+             for i in range(len(doc)):
+                 page = doc[i]
+                 pix = page.get_pixmap(matrix=fitz.Matrix(2.0, 2.0))
+                 output_file = generate_filename(f"page_{i}", "png")
+                 pix.save(output_file)
+                 output_files.append(output_file)
+         doc.close()
+         elapsed = int(time.time() - start_time)
+         status.text(f"PDF Snapshot ({mode}) completed in {elapsed}s!")
+         return output_files
      except Exception as e:
+         status.error(f"Error: {str(e)}")
+         return []
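The three snapshot modes differ only in how many pages get rasterized at 2x zoom. A sketch of driving the coroutine from synchronous Streamlit code (the filename paper.pdf is hypothetical; asyncio.run assumes no event loop is already running):

```python
# Rasterize the first two pages of a downloaded paper.
pngs = asyncio.run(process_pdf_snapshot("paper.pdf", mode="twopage"))
for png in pngs:
    st.image(png)  # one PNG per page, named via generate_filename()
```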

+ # ------------------ GPT & Chat Functions ------------------
+ def process_text(text_input):
+     if text_input:
+         st.session_state.messages.append({"role": "user", "content": text_input})
+         with st.chat_message("user"):
+             st.markdown(text_input)
+         client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'), organization=os.getenv('OPENAI_ORG_ID'))
+         with st.chat_message("assistant"):
+             completion = client.chat.completions.create(
+                 model="gpt-4o-2024-05-13",
+                 messages=st.session_state.messages,
+                 stream=False
+             )
+             return_text = completion.choices[0].message.content
+             st.write("Assistant: " + return_text)
+         st.session_state.messages.append({"role": "assistant", "content": return_text})
+         return return_text
+
+ def process_text2(text_input, model="gpt-4o-2024-05-13"):
+     if text_input:
+         client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'), organization=os.getenv('OPENAI_ORG_ID'))
+         completion = client.chat.completions.create(
+             model=model,
+             messages=st.session_state.messages,
+             stream=False
+         )
+         return_text = completion.choices[0].message.content
+         st.write("Assistant: " + return_text)
+         st.session_state.messages.append({"role": "assistant", "content": return_text})
+         return return_text
+
+ # ------------------ Audio & Video Processing Functions ------------------
  def SpeechSynthesis(result):
      documentHTML5 = f'''
      <!DOCTYPE html>

      '''
      components.html(documentHTML5, width=1280, height=300)

+ def process_audio(audio_input, text_input=''):
+     if audio_input:
+         # Save the audio input; audio_recorder() hands back raw bytes,
+         # while uploaded files expose .getvalue()
+         with open("temp_audio.wav", "wb") as file:
+             file.write(audio_input if isinstance(audio_input, bytes) else audio_input.getvalue())
+         client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'), organization=os.getenv('OPENAI_ORG_ID'))
+         transcription = client.audio.transcriptions.create(model="whisper-1", file=open("temp_audio.wav", "rb"))
+         st.session_state.messages.append({"role": "user", "content": transcription.text})
+         with st.chat_message("assistant"):
+             st.markdown(transcription.text)
+             SpeechSynthesis(transcription.text)
+         filename = generate_filename(transcription.text, "md")
+         with open(filename, "w", encoding="utf-8") as f:
+             f.write(transcription.text)
+         return transcription.text
+
+ def process_video_and_audio(video_input):
+     if video_input:
+         # Save video file
+         video_path = video_input.name
+         with open(video_path, "wb") as f:
+             f.write(video_input.getbuffer())
+         # Extract frames
+         base64Frames = []
+         video = cv2.VideoCapture(video_path)
+         total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
+         fps = video.get(cv2.CAP_PROP_FPS)
+         frames_to_skip = int(fps * 1)  # 1 second per frame
+         curr_frame = 0
+         while curr_frame < total_frames - 1:
+             video.set(cv2.CAP_PROP_POS_FRAMES, curr_frame)
+             success, frame = video.read()
+             if not success:
+                 break
+             _, buffer = cv2.imencode(".jpg", frame)
+             base64Frames.append(base64.b64encode(buffer).decode("utf-8"))
+             curr_frame += frames_to_skip
+         video.release()
+         # Audio transcription from video
+         try:
+             clip = VideoFileClip(video_path)
+             audio_path = f"{os.path.splitext(video_path)[0]}.mp3"
+             clip.audio.write_audiofile(audio_path, bitrate="32k")
+             clip.audio.close()
+             clip.close()
+             with open(audio_path, "rb") as f:
+                 audio_data = f.read()
+             client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'), organization=os.getenv('OPENAI_ORG_ID'))
+             transcription = client.audio.transcriptions.create(model="whisper-1", file=BytesIO(audio_data))
+         except Exception as e:
+             transcription = type("Dummy", (), {"text": "No transcript available."})()
+         # Display frames and transcript
+         st.markdown("### Video Frames")
+         for frame_b64 in base64Frames:
+             st.image(f"data:image/jpg;base64,{frame_b64}", use_column_width=True)
+         st.markdown("### Audio Transcription")
+         st.write(transcription.text)
+         return transcription.text
+
+ # ------------------ Python Code Executor Functions ------------------
+ def extract_python_code(markdown_text):
+     pattern = r"```python\s*(.*?)\s*```"
+     matches = re.findall(pattern, markdown_text, re.DOTALL)
+     return matches

+ def execute_code(code):
+     buffer = io.StringIO()
+     local_vars = {}
+     try:
+         with redirect_stdout(buffer):
+             exec(code, {}, local_vars)
+         output = buffer.getvalue()
+         return output, None
+     except Exception as e:
+         return None, str(e)
+     finally:
+         buffer.close()
+
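Together these two helpers form the interpreter loop used by the Code Executor tab: pull fenced python blocks out of a GPT markdown reply, run one with stdout redirected, and get back an (output, error) pair. A small self-contained check, assuming both functions are in scope:

```python
fence = "`" * 3  # build the fences so this example stays self-contained
reply = f"Here you go:\n{fence}python\nprint(sum(range(5)))\n{fence}"
blocks = extract_python_code(reply)   # -> ['print(sum(range(5)))']
output, error = execute_code(blocks[0])
assert output == "10\n" and error is None
```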
+ def create_and_save_file(filename, prompt, response, should_save=True):
+     if not should_save:
+         return
+     base_filename, ext = os.path.splitext(filename)
+     if ext in ['.txt', '.htm', '.md']:
+         with open(f"{base_filename}.md", 'w', encoding='utf-8') as file:
+             file.write(response)
+
+ # ------------------ Integrated Workflow Function ------------------
+ def integrated_workflow():
+     st.header("Integrated Workflow: From Paper to Code")
+     st.markdown("""
+     1. **Upload a PDF or Image** of a paper (double-page images work best).
+     2. **Run OCR** to extract text.
+     3. **Generate Python Code** based on the extracted text using GPT.
+     4. **Review and Execute** the generated code.
+     """)
+     uploaded_file = st.file_uploader("Upload PDF or Image", type=["pdf", "png", "jpg", "jpeg"], key="integrated_file")
+     if uploaded_file:
+         # Save the uploaded file
+         file_path = f"uploaded_{uploaded_file.name}"
+         with open(file_path, "wb") as f:
+             f.write(uploaded_file.getvalue())
+         st.success(f"Uploaded file saved as {file_path}")
+         # If PDF, show first page snapshot; if image, load directly.
+         if uploaded_file.type == "application/pdf":
+             mode = st.selectbox("Snapshot Mode", ["single", "twopage", "allpages"])
+             snapshots = asyncio.run(process_pdf_snapshot(file_path, mode))
              for snapshot in snapshots:
+                 st.image(Image.open(snapshot), caption=f"Snapshot: {snapshot}", use_column_width=True)
+         else:
+             st.image(Image.open(file_path), caption="Uploaded Image", use_column_width=True)
+         # Run OCR on the file (using first page or the image itself)
+         if st.button("Run OCR on File"):
+             if uploaded_file.type == "application/pdf":
+                 doc = fitz.open(file_path)
+                 page = doc[0]
+                 pix = page.get_pixmap(matrix=fitz.Matrix(2.0, 2.0))
+                 temp_img = f"ocr_{os.path.basename(file_path)}.png"
+                 pix.save(temp_img)
+                 doc.close()
+                 image = Image.open(temp_img)
+             else:
+                 image = Image.open(file_path)
+             ocr_output_file = generate_filename("ocr_output", "txt")
+             ocr_result = asyncio.run(process_ocr(image, ocr_output_file))
+             st.text_area("OCR Output", ocr_result, height=200)
+             # Use extracted OCR text as prompt to generate python code
+             st.markdown("### Generate Python Code from OCR Text")
+             code_prompt = st.text_area("Edit Prompt for Code Generation", value=f"Generate a Python script that processes the following scientific text:\n\n{ocr_result}", height=200)
+             if st.button("Generate Code"):
+                 code_generated = process_text_with_prompt(ocr_result, code_prompt, model="gpt-4o-mini")
+                 st.code(code_generated, language="python")
+                 # Save generated code
+                 code_filename = generate_filename("generated_code", "py")
+                 with open(code_filename, "w", encoding="utf-8") as f:
+                     f.write(code_generated)
+                 st.markdown(get_download_link(code_filename, "text/plain", "Download Generated Code"), unsafe_allow_html=True)
+                 # Optionally execute the generated code
+                 if st.button("Execute Generated Code"):
+                     output, error = execute_code(code_generated)
                      if error:
+                         st.error(f"Error executing code:\n{error}")
                      else:
+                         st.success("Code executed successfully. Output:")
+                         st.code(output)

+ # ------------------ Sidebar: Asset Gallery & Logs ------------------
  def update_gallery():
+     container = st.sidebar.empty()
      all_files = get_gallery_files()
      if all_files:
+         container.markdown("### Asset Gallery")
          cols = container.columns(2)
+         for idx, file in enumerate(all_files[:st.session_state.get('gallery_size', 5)]):
              with cols[idx % 2]:
                  if file.endswith('.png'):
+                     st.image(Image.open(file), caption=os.path.basename(file), use_column_width=True)
+                 else:
+                     st.markdown(os.path.basename(file))
+                 if st.button("Delete " + os.path.basename(file), key="del_" + file):
                      os.remove(file)
                      st.experimental_rerun()

  update_gallery()
+ st.sidebar.subheader("Action Logs")
  for record in log_records:
      st.sidebar.write(f"{record.asctime} - {record.levelname} - {record.message}")

+ # ------------------ Main App Navigation ------------------
+ st.title("Combined Multimodal AI Suite")
+
+ tabs = st.tabs(["Home", "Camera & Images", "PDF & Documents", "Multimodal Chat", "Code Executor", "Integrated Workflow"])
+
+ # --- Home Tab ---
+ with tabs[0]:
+     st.header("Welcome to the Combined Multimodal AI Suite")
+     st.markdown("""
+     This application integrates multiple AI functionalities:
+
+     - **Camera & Image Processing:** Capture images, generate new images using diffusion models.
+     - **PDF & Document Processing:** Download PDFs, perform OCR, and generate markdown summaries.
+     - **Multimodal Chat:** Chat with GPT-4o using text, audio, image, and video inputs.
+     - **Code Executor:** Write, generate, and execute Python code interactively.
+     - **Integrated Workflow:** Seamlessly extract text from papers and generate & run Python code.
+
+     Use the tabs above to explore each modality.
+     """)
+
+ # --- Camera & Images Tab ---
+ with tabs[1]:
+     st.header("Camera & Image Processing")
+     st.subheader("Capture and Process Images")
+     col1, col2 = st.columns(2)
+     with col1:
+         cam0_img = st.camera_input("Take a picture - Cam 0", key="cam0")
+         if cam0_img:
+             filename = generate_filename("cam0_snapshot", "png")
+             with open(filename, "wb") as f:
+                 f.write(cam0_img.getvalue())
+             st.image(Image.open(filename), caption="Camera 0 Snapshot", use_column_width=True)
+             st.session_state.history.append(f"Captured {filename}")
+     with col2:
+         cam1_img = st.camera_input("Take a picture - Cam 1", key="cam1")
+         if cam1_img:
+             filename = generate_filename("cam1_snapshot", "png")
+             with open(filename, "wb") as f:
+                 f.write(cam1_img.getvalue())
+             st.image(Image.open(filename), caption="Camera 1 Snapshot", use_column_width=True)
+             st.session_state.history.append(f"Captured {filename}")
+     st.markdown("---")
+     st.subheader("Generate New Image with Diffusion")
+     prompt_img = st.text_input("Enter prompt for image generation", "A neon futuristic cityscape")
+     if st.button("Generate Image"):
+         output_file = generate_filename("gen_output", "png")
+         result_img = asyncio.run(process_image_gen(prompt_img, output_file))
+         st.image(result_img, caption="Generated Image", use_column_width=True)
+
+ # --- PDF & Documents Tab ---
+ with tabs[2]:
+     st.header("PDF & Document Processing")
+     st.subheader("Download and Process PDFs")
+     url_input = st.text_area("Enter PDF URLs (one per line)", height=100)
+     if st.button("Download PDFs"):
+         urls = [u.strip() for u in url_input.splitlines() if u.strip()]
+         progress_bar = st.progress(0)
+         for idx, url in enumerate(urls):
+             output_path = generate_filename(url, "pdf")
+             if download_pdf(url, output_path):
+                 st.session_state.downloaded_pdfs[url] = output_path
+                 st.success(f"Downloaded: {output_path}")
+             progress_bar.progress((idx + 1) / len(urls))
+     st.markdown("---")
+     st.subheader("OCR & PDF Snapshot")
+     all_assets = get_gallery_files()
+     selected_asset = st.selectbox("Select an asset", all_assets) if all_assets else None
+     if selected_asset and st.button("Run OCR on Selected"):
+         if selected_asset.endswith('.png'):
+             image = Image.open(selected_asset)
+         else:
+             doc = fitz.open(selected_asset)
+             pix = doc[0].get_pixmap(matrix=fitz.Matrix(2.0, 2.0))
+             image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+             doc.close()
+         output_file = generate_filename("ocr_output", "txt")
+         ocr_result = asyncio.run(process_ocr(image, output_file))
+         st.text_area("OCR Result", ocr_result, height=200)
+     st.markdown("---")
+     st.subheader("Markdown Gallery")
+     md_files = sorted(glob.glob("*.md"))
+     if md_files:
+         for md in md_files:
+             st.markdown(f"**{md}**")
+             st.markdown(get_download_link(md, "text/markdown", "Download MD"), unsafe_allow_html=True)
+
+ # --- Multimodal Chat Tab ---
+ with tabs[3]:
+     st.header("Multimodal Chat")
+     st.markdown("Chat with GPT-4o using text, audio, image, or video inputs.")
+     mode = st.selectbox("Select Mode", ["Text", "Image", "Audio", "Video"])
+     if mode == "Text":
+         text_input = st.text_input("Enter your text prompt")
+         if st.button("Send Text"):
+             response = process_text(text_input)
+             st.markdown(response)
+     elif mode == "Image":
+         text_prompt = st.text_input("Enter prompt for image analysis", "Describe this image and list 10 facts.")
+         image_file = st.file_uploader("Upload an image", type=["png", "jpg", "jpeg"], key="chat_image")
+         if image_file:
+             image = Image.open(image_file)
+             st.image(image, caption="Uploaded Image", use_column_width=True)
+             response = process_image_with_prompt(image, text_prompt)
+             st.markdown(response)
+     elif mode == "Audio":
+         st.markdown("Record or upload an audio file for transcription.")
+         audio_bytes = audio_recorder()
+         if audio_bytes:
+             st.audio(audio_bytes, format="audio/wav")
+             transcription = process_audio(audio_bytes)
+             st.markdown(transcription)
+     elif mode == "Video":
+         video_file = st.file_uploader("Upload a video file", type=["mp4", "webm"], key="chat_video")
+         if video_file:
+             transcript = process_video_and_audio(video_file)
+             st.markdown("Video Transcript:")
+             st.write(transcript)
+
+     st.markdown("---")
+     st.subheader("Chat History")
+     for msg in st.session_state.messages:
+         with st.chat_message(msg["role"]):
+             st.markdown(msg["content"])
+
+ # --- Code Executor Tab ---
+ with tabs[4]:
+     st.header("Python Code Executor")
+     st.markdown("Enter Python code below or upload a .py/.md file. The code will be executed in a sandboxed environment.")
+     uploaded_file = st.file_uploader("Upload Python (.py) or Markdown (.md) file", type=["py", "md"], key="code_file")
+     if 'code' not in st.session_state:
+         st.session_state.code = """import streamlit as st
+ st.write("Hello from the Python Code Executor!")"""
+     if uploaded_file is None:
+         code_input = st.text_area("Python Code Editor:", value=st.session_state.code, height=400, key="code_editor")
+     else:
+         content = uploaded_file.getvalue().decode()
+         if uploaded_file.type == "text/markdown":
+             code_blocks = extract_python_code(content)
+             if code_blocks:
+                 code_input = code_blocks[0]
+             else:
+                 st.error("No Python code block found in the markdown file!")
+                 code_input = ""
+         else:
+             code_input = content
+         st.code(code_input, language='python')
+     col1, col2 = st.columns([1, 1])
+     with col1:
+         if st.button("▶️ Run Code"):
+             if code_input:
+                 output, error = execute_code(code_input)
+                 if error:
+                     st.error(f"Error:\n{error}")
+                 elif output:
+                     st.code(output)
+                 else:
+                     st.success("Code executed with no output.")
+             else:
+                 st.warning("Please enter some code!")
+     with col2:
+         if st.button("🗑️ Clear Code"):
+             st.session_state.code = ""
+             st.experimental_rerun()
+     with st.expander("How to use the Code Executor"):
+         st.markdown("""
+         - Enter or upload Python code.
+         - Click **Run Code** to execute.
+         - The output (or any errors) will be displayed below.
+         """)
+
+ # --- Integrated Workflow Tab ---
+ with tabs[5]:
+     integrated_workflow()
+
+ # ------------------ Chat Input at Bottom ------------------
+ if prompt := st.chat_input("GPT-4o Multimodal ChatBot - How can I help you?"):
      st.session_state.messages.append({"role": "user", "content": prompt})
      with st.chat_message("user"):
          st.markdown(prompt)
      with st.chat_message("assistant"):
+         response = process_text2(prompt)
+         st.session_state.messages.append({"role": "assistant", "content": response})