mgbam committed on
Commit
9f48d45
·
verified ·
1 Parent(s): 2d8777b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +215 -202
app.py CHANGED
@@ -2,6 +2,7 @@ import streamlit as st
2
  import pdfplumber
3
  import pytesseract
4
  import openai
 
5
  import json
6
  import pandas as pd
7
  import numpy as np
@@ -11,227 +12,239 @@ import time
11
  import traceback
12
  import os
13
  import hashlib
 
14
 
15
- # Configuration
16
- SUPPORTED_MODELS = {
17
- "Deepseek": {
18
- "base_url": "https://api.deepseek.com/v1",
19
- "required_key": "DEEPSEEK_KEY"
20
- },
21
- "OpenAI": {
22
- "base_url": "https://api.openai.com/v1",
23
- "required_key": "OPENAI_KEY"
24
- }
25
- }
26
-
27
- def initialize_session():
28
- """Initialize session state with validation"""
29
- defaults = {
30
- 'processing_stage': 'idle',
31
- 'document_data': [],
32
- 'qa_pairs': [],
33
- 'export_formats': ['JSON', 'CSV', 'Parquet'],
34
- 'model_settings': {
35
- 'current_model': 'Deepseek',
36
- 'temperature': 0.3
37
- },
38
- 'api_keys': {}
39
- }
40
-
41
- for key, val in defaults.items():
42
- if key not in st.session_state:
43
- st.session_state[key] = val
44
-
45
- def handle_image_errors(img_stream):
46
- """Robust image processing with multiple fallbacks"""
47
- try:
48
- # First try standard RGB conversion
49
- return Image.frombytes("RGB", (img_stream['width'], img_stream['height']), img_stream.get_data())
50
- except:
 
51
  try:
52
- # Fallback to grayscale conversion
53
- return Image.frombytes("L", (img_stream['width'], img_stream['height']), img_stream.get_data()).convert("RGB")
 
 
 
 
 
 
54
  except Exception as e:
55
- st.error(f"Critical image error: {str(e)[:200]}")
56
- return None
57
 
58
- def process_pdf(uploaded_file):
59
- """PDF processing with enhanced error recovery"""
60
- st.session_state.processing_stage = 'extracting'
61
- doc_data = []
62
-
63
- try:
64
- with pdfplumber.open(uploaded_file) as pdf:
65
- for page_num, page in enumerate(pdf.pages, 1):
66
- page_data = {
67
- "page": page_num,
68
- "text": page.extract_text() or "",
69
- "images": []
70
- }
71
-
72
- # Process images with error containment
73
- for img_idx, img in enumerate(page.images):
74
- processed_img = handle_image_errors(img['stream'])
75
- if processed_img:
76
- page_data["images"].append(processed_img)
77
-
78
- doc_data.append(page_data)
79
- time.sleep(0.01) # Yield for UI updates
80
-
81
- st.session_state.document_data = doc_data
82
- return True
83
- except Exception as e:
84
- st.error(f"PDF processing failed: {str(e)}")
85
- return False
86
-
87
- def generate_qa_content():
88
- """Model-agnostic content generation"""
89
- st.session_state.processing_stage = 'generating'
90
- qa_pairs = []
91
-
92
- try:
93
- client = openai.OpenAI(
94
- base_url=SUPPORTED_MODELS[st.session_state.model_settings['current_model']]['base_url'],
95
- api_key=st.session_state.api_keys.get(
96
- SUPPORTED_MODELS[st.session_state.model_settings['current_model']]['required_key']
97
- )
98
- )
99
 
100
- for page in st.session_state.document_data:
101
- text_content = page['text'] or " ".join([
102
- pytesseract.image_to_string(img) for img in page['images']
103
- ])
 
 
 
 
104
 
105
- response = client.chat.completions.create(
106
- model="gpt-4-turbo" if st.session_state.model_settings['current_model'] == "OpenAI" else "deepseek-chat",
107
- messages=[{
108
- "role": "user",
109
- "content": f"Generate 3 Q&A pairs from this financial content:\n{text_content}\nOutput JSON format with keys: question, answer_1, answer_2"
110
- }],
111
- response_format={"type": "json_object"},
112
- temperature=st.session_state.model_settings['temperature']
113
- )
114
 
115
  try:
116
- result = json.loads(response.choices[0].message.content)
117
- qa_pairs.extend(result.get('qa_pairs', []))
118
- except json.JSONDecodeError:
119
- st.error("Failed to parse model response")
120
-
121
- st.session_state.qa_pairs = qa_pairs
122
- return True
123
- except Exception as e:
124
- st.error(f"Generation failed: {str(e)}")
125
- return False
126
-
127
- def export_data():
128
- """Multi-format export handler"""
129
- formats = st.session_state.export_formats
130
- timestamp = time.strftime("%Y%m%d-%H%M%S")
131
- base_name = f"wealth_report_{timestamp}"
132
- export_package = {}
133
-
134
- try:
135
- # JSON Export
136
- if 'JSON' in formats:
137
- export_package[f'{base_name}.json'] = json.dumps(
138
- st.session_state.qa_pairs,
139
- indent=2
140
- ).encode()
141
 
142
- # CSV Export
143
- if 'CSV' in formats:
144
- df = pd.DataFrame(st.session_state.qa_pairs)
145
- export_package[f'{base_name}.csv'] = df.to_csv(index=False).encode()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
 
147
- # Parquet Export
 
 
 
148
  if 'Parquet' in formats:
149
- df = pd.DataFrame(st.session_state.qa_pairs)
150
  buffer = BytesIO()
151
  df.to_parquet(buffer)
152
- export_package[f'{base_name}.parquet'] = buffer.getvalue()
153
 
154
- return export_package
155
- except Exception as e:
156
- st.error(f"Export failed: {str(e)}")
157
- return None
158
-
159
- def api_key_manager():
160
- """Secure API key management UI"""
161
- with st.sidebar.expander("πŸ”‘ API Key Management", expanded=True):
162
- for model in SUPPORTED_MODELS:
163
- key = st.text_input(
164
- f"{model} API Key",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165
  type="password",
166
- key=f"key_{model}"
167
  )
168
- if key:
169
- st.session_state.api_keys[SUPPORTED_MODELS[model]['required_key']] = key
 
 
 
 
 
170
 
171
- def main_interface():
172
- """Core application interface"""
173
- st.title("Global Wealth Report Analyzer")
174
- st.write("Advanced financial document processing with multi-model AI support")
175
 
176
- # File Upload
177
- uploaded_file = st.file_uploader(
178
- "Upload PDF Report",
179
- type=["pdf"],
180
- accept_multiple_files=False
181
- )
182
 
183
- # Processing Controls
184
- if uploaded_file and st.button("Start Analysis"):
185
- if process_pdf(uploaded_file) and generate_qa_content():
186
- st.session_state.processing_stage = 'complete'
187
 
188
- # Results Display
189
- if st.session_state.processing_stage == 'complete':
190
- st.success("Analysis Complete!")
191
-
192
- # Data Export
193
- with st.expander("πŸ“¦ Export Results", expanded=True):
194
- cols = st.columns(3)
195
- with cols[0]:
196
- st.multiselect(
197
- "Export Formats",
198
- ['JSON', 'CSV', 'Parquet'],
199
- default=['JSON', 'CSV'],
200
- key='export_formats'
201
- )
202
- with cols[1]:
203
- st.download_button(
204
- "Download Results",
205
- data=export_data()['wealth_report.json'],
206
- file_name="wealth_report.zip",
207
- mime="application/zip",
208
- disabled=not st.session_state.qa_pairs
209
- )
210
-
211
- # Results Preview
212
- with st.expander("πŸ” View Generated Content"):
213
- st.dataframe(
214
- pd.DataFrame(st.session_state.qa_pairs),
215
- use_container_width=True,
216
- height=400
217
- )
218
-
219
- def model_settings():
220
- """Model configuration panel"""
221
- with st.sidebar.expander("🧠 AI Settings", expanded=True):
222
- st.selectbox(
223
- "AI Model",
224
- list(SUPPORTED_MODELS.keys()),
225
- key='model_settings.current_model'
226
- )
227
- st.slider(
228
- "Creativity Level",
229
- 0.0, 1.0, 0.3,
230
- key='model_settings.temperature'
231
- )
232
 
233
  if __name__ == "__main__":
234
- initialize_session()
235
- api_key_manager()
236
- model_settings()
237
- main_interface()
 
2
  import pdfplumber
3
  import pytesseract
4
  import openai
5
+ from openai import OpenAI
6
  import json
7
  import pandas as pd
8
  import numpy as np
 
12
  import traceback
13
  import os
14
  import hashlib
15
+ import groq
16
 
17
class SyntheticDataGenerator:
    """Extract text/images from an uploaded PDF and generate Q&A pairs with an LLM.

    All results are kept in ``st.session_state``:

    * ``doc_data``   - list of ``{"page", "text", "images"}`` dicts, one per page.
    * ``qa_pairs``   - list of Q&A records parsed from the model responses.
    * ``processing`` - ``{"stage", "errors", "warnings"}`` status dictionary.
    """

    def __init__(self):
        # Provider registry. Each entry holds a factory that builds an API
        # client from a key, the model names offered in the UI, and a key name.
        self.SUPPORTED_MODELS = {
            "Deepseek": {
                "client": lambda key: OpenAI(base_url="https://api.deepseek.com/v1", api_key=key),
                "models": ["deepseek-chat"],
                "key_name": "DEEPSEEK_KEY"
            },
            "OpenAI": {
                "client": lambda key: OpenAI(api_key=key),
                "models": ["gpt-4-turbo"],
                "key_name": "OPENAI_KEY"
            },
            "Mistral-Groq": {
                "client": lambda key: groq.Groq(api_key=key),
                "models": ["mixtral-8x7b-32768", "llama2-70b-4096"],
                "key_name": "GROQ_KEY"
            }
        }
        self.init_session()

    def init_session(self):
        """Create the session-state entries this class relies on, if missing."""
        if 'qa_pairs' not in st.session_state:
            st.session_state.qa_pairs = []
        if 'doc_data' not in st.session_state:
            st.session_state.doc_data = []
        if 'processing' not in st.session_state:
            st.session_state.processing = {
                'stage': 'idle',
                'errors': [],
                'warnings': []
            }

    def process_pdf(self, uploaded_file):
        """Extract every page of ``uploaded_file`` into ``st.session_state.doc_data``.

        Args:
            uploaded_file: Path or binary file object accepted by ``pdfplumber.open``.

        Returns:
            True when the PDF could be opened and iterated (individual page or
            image failures are logged but do not abort), False otherwise.
        """
        st.session_state.processing = {'stage': 'extracting', 'errors': [], 'warnings': []}
        # Start from an empty document: without this reset, every new run
        # would append its pages to those of the previous upload.
        st.session_state.doc_data = []

        try:
            # pdfplumber.open accepts both paths and file-like objects.
            with pdfplumber.open(uploaded_file) as pdf:
                for page_num, page in enumerate(pdf.pages, 1):
                    st.session_state.doc_data.append(self._process_page(page, page_num))
        except Exception as e:
            self._log_error(f"PDF loading failed: {str(e)}")
            return False

        error_count = len(st.session_state.processing['errors'])
        if error_count > 0:
            st.error(f"Processed with {error_count} errors")
        return True

    def _process_page(self, page, page_num):
        """Return ``{"page", "text", "images"}`` for one page.

        Text extraction and image extraction are guarded separately so that a
        failure in one does not discard the result of the other.
        """
        page_data = {"page": page_num, "text": "", "images": []}

        try:
            page_data["text"] = page.extract_text() or ""
        except Exception as e:
            self._log_error(f"Page {page_num} text extraction failed: {str(e)}")

        try:
            for img_idx, img in enumerate(page.images):
                img_data = self._process_image(img, page_num, img_idx)
                if img_data:
                    page_data["images"].append(img_data)
        except Exception as e:
            self._log_error(f"Page {page_num} image processing failed: {str(e)}")

        return page_data

    def _process_image(self, img, page_num, img_idx):
        """Decode one embedded image into an RGB ``Image``, or return None.

        Args:
            img: Image record from ``page.images``; must expose a ``'stream'``
                entry with a ``get_data()`` method.
            page_num: 1-based page number, used only in error messages.
            img_idx: Index of the image on the page, used only in error messages.
        """
        try:
            stream = img['stream']
            width = self._get_dimension(stream, 'width')
            height = self._get_dimension(stream, 'height')

            if width <= 0 or height <= 0:
                raise ValueError("Invalid image dimensions")

            # Decode the stream once and reuse the bytes for both attempts.
            raw = stream.get_data()
            try:
                return Image.frombytes("RGB", (width, height), raw)
            except Exception:
                # The buffer size did not match RGB; retry as 8-bit grayscale.
                return Image.frombytes("L", (width, height), raw).convert("RGB")
        except Exception as e:
            self._log_error(f"Page {page_num} image {img_idx} failed: {str(e)}")
            return None

    def _get_dimension(self, stream, dimension):
        """Look up an integer dimension on ``stream``, returning 0 if not found.

        The lookup tries ``stream`` itself, then ``stream['stream']`` and
        ``stream['data']``. In each container the key is tried as given and
        capitalized, because PDF image dictionaries name these entries
        ``Width`` / ``Height``.
        """
        keys = [dimension]
        if isinstance(dimension, str) and dimension.capitalize() != dimension:
            keys.append(dimension.capitalize())

        containers = [stream]
        for nested_key in ('stream', 'data'):
            try:
                containers.append(stream[nested_key])
            except Exception:
                continue

        for container in containers:
            for key in keys:
                try:
                    return int(container[key])
                except Exception:
                    continue
        return 0

    def generate_qa(self, model_provider, model_name, temperature):
        """Generate Q&A pairs for every extracted page and store them in session state.

        Args:
            model_provider: Key of ``self.SUPPORTED_MODELS``; the API key is read
                from ``st.session_state[model_provider.lower() + "_key"]``.
            model_name: Model identifier passed to the provider.
            temperature: Sampling temperature passed to the provider.

        Returns:
            True on success (``st.session_state.qa_pairs`` is replaced),
            False if any step raised; the error is logged.
        """
        # Keep the same shape as the dictionary built by init_session/process_pdf.
        st.session_state.processing = {'stage': 'generating', 'errors': [], 'warnings': []}
        qa_pairs = []

        try:
            api_key = st.session_state.get(model_provider.lower() + "_key")
            if not api_key:
                raise ValueError(f"No API key provided for {model_provider}")
            client = self.SUPPORTED_MODELS[model_provider]["client"](api_key)

            for page in st.session_state.doc_data:
                content = self._get_page_content(page)
                if not content.strip():
                    # Nothing to send for this page; note it and move on.
                    st.session_state.processing['warnings'].append(
                        f"Page {page.get('page')} has no extractable content"
                    )
                    continue
                response = self._generate(client, model_name, content, temperature)
                qa_pairs.extend(self._parse_response(response))

            st.session_state.qa_pairs = qa_pairs
            return True
        except Exception as e:
            self._log_error(f"Generation failed: {str(e)}")
            return False

    def _build_prompt(self, content):
        """Wrap page content in the instruction the parser depends on.

        JSON mode requires the messages to ask for JSON explicitly, and
        ``_parse_response`` reads the ``qa_pairs`` key, so both are spelled out.
        """
        return (
            "Generate 3 question-and-answer pairs from the document content below.\n"
            "Respond with a single JSON object of the form "
            '{"qa_pairs": [{"question": "...", "answer": "..."}]}.\n\n'
            "Content:\n" + content
        )

    def _generate(self, client, model, content, temp):
        """Request a JSON-mode chat completion for ``content`` and return the raw response.

        The same ``chat.completions.create`` call is used for every client type
        produced by ``SUPPORTED_MODELS``.
        """
        return client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": self._build_prompt(content)}],
            temperature=temp,
            response_format={"type": "json_object"}
        )

    def _parse_response(self, response):
        """Return the list of Q&A records in ``response``, or ``[]`` on failure.

        Accepts either an object with a ``qa_pairs`` list or a bare JSON list.
        """
        try:
            payload = json.loads(response.choices[0].message.content)
            pairs = payload.get('qa_pairs', []) if isinstance(payload, dict) else payload
            if not isinstance(pairs, list):
                raise ValueError("expected a list of Q&A pairs")
            return pairs
        except Exception as e:
            self._log_error(f"Response parsing failed: {str(e)}")
            return []

    def export_data(self, formats):
        """Serialize ``st.session_state.qa_pairs`` into the requested formats.

        Args:
            formats: Iterable containing any of ``'JSON'``, ``'CSV'``, ``'Parquet'``.

        Returns:
            Dict mapping a file name to its content as ``bytes``.
        """
        exports = {}
        df = pd.DataFrame(st.session_state.qa_pairs)

        if 'JSON' in formats:
            exports['synthetic_data.json'] = df.to_json(orient='records').encode()
        if 'CSV' in formats:
            exports['synthetic_data.csv'] = df.to_csv(index=False).encode()
        if 'Parquet' in formats:
            buffer = BytesIO()
            df.to_parquet(buffer)
            exports['synthetic_data.parquet'] = buffer.getvalue()

        return exports

    def _log_error(self, message):
        """Record ``message`` in ``processing['errors']`` and show it in the UI."""
        st.session_state.processing.setdefault('errors', []).append(message)
        st.error(message)

    def _get_page_content(self, page):
        """Return the page text, falling back to OCR of its images when the text is empty."""
        text = page["text"]
        if not text:
            text = " ".join(pytesseract.image_to_string(img) for img in page["images"])
        return text
191
+
192
def ui_setup(generator=None):
    """Configure the page and render the sidebar controls.

    Args:
        generator: Optional ``SyntheticDataGenerator`` whose ``SUPPORTED_MODELS``
            registry drives the provider and model choices. When omitted, one
            is created here; the function previously read an undefined
            ``generator`` name and failed with ``NameError``.

    Returns:
        Tuple ``(provider, model, temp)`` with the selected provider name,
        model name and temperature.
    """
    # Page configuration is issued before any other Streamlit element.
    st.set_page_config(
        page_title="Synthetic Data Factory Pro",
        page_icon="🏭",
        layout="wide",
        initial_sidebar_state="expanded"
    )

    if generator is None:
        generator = SyntheticDataGenerator()

    # Take the provider names from the registry so the key inputs and the
    # selector can never drift apart from the supported providers.
    providers = list(generator.SUPPORTED_MODELS)

    with st.sidebar:
        st.header("🔑 API Key Management")
        for provider_name in providers:
            # The widget key is where generate_qa later looks up the API key.
            st.text_input(
                f"{provider_name} API Key",
                type="password",
                key=f"{provider_name.lower()}_key"
            )

        st.header("🧠 AI Configuration")
        provider = st.selectbox("Model Provider", providers)
        model = st.selectbox("Model", generator.SUPPORTED_MODELS[provider]["models"])
        temp = st.slider("Temperature", 0.0, 1.0, 0.3)

    return provider, model, temp
216
 
217
def _build_export_archive(exports):
    """Pack ``{file name: bytes}`` into an in-memory zip and return its bytes."""
    import zipfile
    from io import BytesIO

    buffer = BytesIO()
    with zipfile.ZipFile(buffer, "w", zipfile.ZIP_DEFLATED) as archive:
        for file_name, payload in exports.items():
            archive.writestr(file_name, payload)
    return buffer.getvalue()


def main():
    """Main application flow: sidebar setup, upload, generation, preview and export."""
    provider, model, temp = ui_setup()
    generator = SyntheticDataGenerator()

    st.title("🏭 Synthetic Data Factory Pro")
    st.write("Enterprise-grade document processing with multi-modal AI")

    uploaded_file = st.file_uploader("Upload PDF Document", type=["pdf"])

    if uploaded_file and st.button("Start Generation"):
        if generator.process_pdf(uploaded_file):
            if generator.generate_qa(provider, model, temp):
                st.success("Generation completed successfully!")

    # Render results from session state rather than inside the button branch:
    # st.button is only True on the run triggered by the click, so widgets
    # nested under it disappear as soon as the user touches any other widget.
    if st.session_state.qa_pairs:
        with st.expander("📊 Results Preview"):
            st.dataframe(pd.DataFrame(st.session_state.qa_pairs))

        with st.expander("📦 Advanced Export"):
            formats = st.multiselect(
                "Select formats",
                ["JSON", "CSV", "Parquet"],
                default=["JSON", "CSV"]
            )
            exports = generator.export_data(formats)

            # The export values are bytes, which json.dumps cannot serialize;
            # ship them as a real zip archive to match the file name and MIME type.
            if st.download_button("Export Package",
                                  data=_build_export_archive(exports),
                                  file_name="synthetic_data.zip",
                                  mime="application/zip",
                                  disabled=not exports):
                st.success("Export package generated!")


if __name__ == "__main__":
    main()