mgbam committed on
Commit
2d8777b
Β·
verified Β·
1 Parent(s): 218d2f0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +169 -163
app.py CHANGED
@@ -1,231 +1,237 @@
1
- # app.py
2
  import streamlit as st
3
  import pdfplumber
4
  import pytesseract
5
- from PIL import Image
6
- import os
7
- import json
8
  import openai
 
9
  import pandas as pd
10
  import numpy as np
 
11
  from io import BytesIO
12
- from concurrent.futures import ThreadPoolExecutor
13
- import hashlib
14
  import time
15
  import traceback
 
 
16
 
17
  # Configuration
18
- MAX_THREADS = 4
19
  SUPPORTED_MODELS = {
20
  "Deepseek": {
21
- "model": "deepseek-chat",
22
- "base_url": "https://api.deepseek.com/v1"
 
 
 
 
23
  }
24
  }
25
 
26
def debug_log(message):
    """Show *message* as a Streamlit toast when debug mode is on.

    No-op unless st.session_state['debug_mode'] is truthy, so call sites
    can log unconditionally.
    """
    if st.session_state.get("debug_mode"):
        st.toast(f"DEBUG: {message}", icon="πŸ›")
30
-
31
def initialize_session_state():
    """Seed every session-state key the app relies on with a default.

    Keys that already exist are left untouched, so values survive
    Streamlit's rerun-on-interaction model.
    """
    required_keys = {
        'document_data': [],           # per-page extraction results
        'qa_pairs': [],                # generated Q&A dicts
        'processing_complete': False,
        'current_stage': 'idle',       # consumed by show_processing_status()
        'api_keys': {},
        'model_choice': "Deepseek",
        'temperature': 0.3,
        'debug_mode': True
    }

    for key, value in required_keys.items():
        if key not in st.session_state:
            st.session_state[key] = value
47
-
48
def show_processing_status():
    """Render the current pipeline stage as a sidebar banner.

    Maps st.session_state.current_stage to a human-readable message;
    unrecognized stages display as 'Unknown'.
    """
    status_messages = {
        'idle': "🟒 Ready to process",
        'extracting': "πŸ” Extracting document content...",
        'generating': "🧠 Generating Q&A pairs...",
        'evaluating': "πŸ“Š Evaluating results...",
        'error': "❌ Processing failed"
    }

    status = st.session_state.current_stage
    debug_log(f"Status update: {status}")
    st.sidebar.markdown(f"**System Status:** {status_messages.get(status, 'Unknown')}")
61
 
62
def process_image(img_data, page_num, img_idx):
    """Decode one embedded PDF image into a PIL RGB image.

    Args:
        img_data: pdfplumber image dict; uses 'stream', 'width', 'height'.
        img_idx / page_num: used only for log and error messages.

    Returns the decoded image, or None on failure (the error is shown via
    st.error instead of raising).
    """
    try:
        img = img_data["stream"]
        width = int(img_data["width"])
        height = int(img_data["height"])

        debug_log(f"Processing image {img_idx} on page {page_num}")

        # Try a 3-channel decode first; fall back to grayscale promoted to RGB.
        try:
            return Image.frombytes("RGB", (width, height), img.get_data())
        except:
            # NOTE(review): bare except also catches SystemExit/KeyboardInterrupt.
            return Image.frombytes("L", (width, height), img.get_data()).convert("RGB")

    except Exception as e:
        st.error(f"Image processing failed (Page {page_num}, Image {img_idx}): {str(e)}")
        return None
80
 
81
def pdf_processing_workflow(uploaded_file):
    """Extract text and images from an uploaded PDF with live progress UI.

    Appends one dict per page to st.session_state.document_data:
    {"page": 1-based number, "text": stripped text, "images": [PIL images]}.
    Returns True on success, False on failure (stage set to 'error').
    """
    st.session_state.current_stage = 'extracting'

    try:
        with pdfplumber.open(uploaded_file) as pdf:
            total_pages = len(pdf.pages)
            progress_bar = st.progress(0)
            status_text = st.empty()

            for page_num, page in enumerate(pdf.pages, 1):
                status_text.text(f"Processing page {page_num}/{total_pages}")
                progress_bar.progress(page_num/total_pages)

                # Per-page try/except so one bad page doesn't abort the run.
                try:
                    text = page.extract_text() or ""
                    images = [process_image(img, page_num, idx)
                              for idx, img in enumerate(page.images)]

                    st.session_state.document_data.append({
                        "page": page_num,
                        "text": text.strip(),
                        "images": [img for img in images if img is not None]
                    })
                except Exception as e:
                    st.error(f"Page {page_num} error: {str(e)}")

                time.sleep(0.1)  # Simulate processing

            progress_bar.empty()
            status_text.success("Document processing complete!")
            return True

    except Exception as e:
        st.session_state.current_stage = 'error'
        st.error(f"PDF processing failed: {str(e)}")
        debug_log(traceback.format_exc())
        return False
119
 
120
def generate_qa_pairs():
    """Generate Q&A pairs for every extracted page via the Deepseek API.

    OCRs page images (pytesseract) only when a page has no text layer.
    Stores accumulated pairs in st.session_state.qa_pairs and advances
    current_stage to 'evaluating'.  Returns True on success, False on
    failure (stage set to 'error').

    NOTE(review): reads the key from st.secrets["DEEPSEEK_API_KEY"]
    regardless of model_choice.
    """
    st.session_state.current_stage = 'generating'
    qa_pairs = []

    try:
        client = openai.OpenAI(
            base_url=SUPPORTED_MODELS[st.session_state.model_choice]["base_url"],
            api_key=st.secrets["DEEPSEEK_API_KEY"]
        )

        for idx, entry in enumerate(st.session_state.document_data):
            # Prefer the PDF text layer; fall back to OCR of page images.
            text_content = entry["text"] or " ".join([
                pytesseract.image_to_string(img) for img in entry["images"]
            ])

            response = client.chat.completions.create(
                model=SUPPORTED_MODELS[st.session_state.model_choice]["model"],
                messages=[{
                    "role": "user",
                    "content": f"Generate 3 Q&A pairs from:\n{text_content}\nReturn JSON format: {{'qa_pairs': [{{'question': '...', 'answer_1': '...', 'answer_2': '...'}}]}}"
                }],
                max_tokens=2048,
                response_format={"type": "json_object"},
                temperature=st.session_state.temperature
            )

            # Malformed JSON from the model is reported but not fatal.
            try:
                result = json.loads(response.choices[0].message.content)
                qa_pairs.extend(result.get("qa_pairs", []))
                debug_log(f"Generated {len(result.get('qa_pairs', []))} pairs for page {entry['page']}")
            except json.JSONDecodeError:
                st.error(f"Invalid response format from API for page {entry['page']}")

        st.session_state.qa_pairs = qa_pairs
        st.session_state.current_stage = 'evaluating'
        return True

    except Exception as e:
        st.session_state.current_stage = 'error'
        st.error(f"Q&A generation failed: {str(e)}")
        debug_log(traceback.format_exc())
        return False
163
 
164
def main():
    """Main application interface: config sidebar, upload, results, export.

    NOTE(review): block nesting reconstructed from a garbled diff — the
    placement of the debug panel at function top level should be confirmed
    against the original file.
    """
    st.set_page_config(
        page_title="Synthetic Data Generator",
        page_icon="πŸ§ͺ",
        layout="wide"
    )

    initialize_session_state()

    # Debug panel
    with st.sidebar:
        st.header("βš™οΈ Configuration")
        st.session_state.model_choice = st.selectbox(
            "AI Model", list(SUPPORTED_MODELS.keys())
        )
        st.session_state.temperature = st.slider(
            "Creativity Level", 0.0, 1.0, 0.3
        )
        st.session_state.debug_mode = st.checkbox("Debug Mode", True)
        show_processing_status()

    st.title("πŸ§ͺ Synthetic Data Generator")

    # File upload section
    uploaded_file = st.file_uploader("Upload PDF Document", type=["pdf"])

    if uploaded_file and st.button("Start Processing"):
        if pdf_processing_workflow(uploaded_file):
            if generate_qa_pairs():
                st.success("Processing completed successfully!")

                # Show results (first 10 pairs only)
                st.header("Generated Q&A Pairs")
                for idx, pair in enumerate(st.session_state.qa_pairs[:10]):
                    with st.expander(f"Q{idx+1}: {pair['question']}"):
                        st.write(f"**Answer 1:** {pair['answer_1']}")
                        st.write(f"**Answer 2:** {pair['answer_2']}")

                # Data export
                st.header("Data Export")
                df = pd.DataFrame(st.session_state.qa_pairs)
                st.download_button(
                    label="Download as CSV",
                    data=df.to_csv(index=False).encode('utf-8'),
                    file_name="synthetic_data.csv",
                    mime="text/csv"
                )

    # Debug information
    if st.session_state.debug_mode:
        with st.expander("Debug Information"):
            st.write("### Session State")
            st.json(st.session_state)

            if st.session_state.get("document_data"):
                st.write("### Document Data Summary")
                st.write(f"Pages processed: {len(st.session_state.document_data)}")
                st.write(f"Total images extracted: {sum(len(p['images']) for p in st.session_state.document_data)}")

            if st.session_state.get("qa_pairs"):
                st.write("### Q&A Statistics")
                st.write(f"Total pairs generated: {len(st.session_state.qa_pairs)}")
                st.write("Sample Q&A pairs:")
                st.table(pd.DataFrame(st.session_state.qa_pairs[:3]))
 
 
 
 
 
229
 
230
if __name__ == "__main__":
    # Streamlit runs this script top-to-bottom on every interaction.
    main()
 
 
 
 
 
1
  import streamlit as st
2
  import pdfplumber
3
  import pytesseract
 
 
 
4
  import openai
5
+ import json
6
  import pandas as pd
7
  import numpy as np
8
+ from PIL import Image
9
  from io import BytesIO
 
 
10
  import time
11
  import traceback
12
+ import os
13
+ import hashlib
14
 
15
# Configuration
# Registry of selectable chat-completion backends.  'base_url' is passed to
# the OpenAI-compatible client; 'required_key' names the entry in
# st.session_state.api_keys that holds that provider's API key.
SUPPORTED_MODELS = {
    "Deepseek": {
        "base_url": "https://api.deepseek.com/v1",
        "required_key": "DEEPSEEK_KEY"
    },
    "OpenAI": {
        "base_url": "https://api.openai.com/v1",
        "required_key": "OPENAI_KEY"
    }
}
26
 
27
def initialize_session():
    """Seed every session-state key this app relies on.

    Keys that are already present are left alone, so user selections and
    processing results survive Streamlit's rerun-on-interaction model.
    """
    defaults = {
        'processing_stage': 'idle',
        'document_data': [],
        'qa_pairs': [],
        'export_formats': ['JSON', 'CSV', 'Parquet'],
        'model_settings': {
            'current_model': 'Deepseek',
            'temperature': 0.3
        },
        'api_keys': {}
    }

    # Only fill in what is missing — never clobber an existing value.
    for name in defaults:
        if name not in st.session_state:
            st.session_state[name] = defaults[name]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
 
45
def handle_image_errors(img_stream):
    """Decode an embedded PDF image stream into a PIL RGB image.

    Tries a direct 3-channel decode first, then falls back to a grayscale
    decode promoted to RGB.  Returns None when both decodes fail; the
    failure is reported via st.error instead of raising.

    NOTE(review): assumes `img_stream` supports dict-style access for
    'width'/'height' and exposes get_data() — confirm against the objects
    pdfplumber puts in page.images[i]['stream'].
    """
    try:
        # Most embedded images are 3-channel; try the cheap path first.
        return Image.frombytes(
            "RGB",
            (img_stream['width'], img_stream['height']),
            img_stream.get_data(),
        )
    except Exception:
        # BUG FIX: was a bare `except:`, which also swallowed
        # SystemExit/KeyboardInterrupt.
        try:
            # Fallback: single-channel (grayscale) data, promoted to RGB.
            return Image.frombytes(
                "L",
                (img_stream['width'], img_stream['height']),
                img_stream.get_data(),
            ).convert("RGB")
        except Exception as e:
            # Truncate so a huge binary repr cannot flood the UI.
            st.error(f"Critical image error: {str(e)[:200]}")
            return None
 
 
57
 
58
def process_pdf(uploaded_file):
    """Extract text and images from an uploaded PDF into session state.

    On success, st.session_state.document_data holds one dict per page:
    {"page": 1-based number, "text": extracted text or "", "images": [PIL images]}.
    Returns True on success, False on failure (error shown in the UI).
    """
    st.session_state.processing_stage = 'extracting'
    doc_data = []

    try:
        with pdfplumber.open(uploaded_file) as pdf:
            for page_num, page in enumerate(pdf.pages, 1):
                page_data = {
                    "page": page_num,
                    "text": page.extract_text() or "",
                    "images": []
                }

                # Decode images one by one so a single corrupt image
                # cannot abort the whole page.
                for img in page.images:
                    processed_img = handle_image_errors(img['stream'])
                    if processed_img is not None:
                        page_data["images"].append(processed_img)

                doc_data.append(page_data)
                time.sleep(0.01)  # Yield briefly so Streamlit can repaint.

        st.session_state.document_data = doc_data
        return True
    except Exception as e:
        # BUG FIX: the stage was previously left at 'extracting' on failure,
        # so the UI never reflected the error state.
        st.session_state.processing_stage = 'error'
        st.error(f"PDF processing failed: {str(e)}")
        return False
86
 
87
def generate_qa_content():
    """Generate Q&A pairs for every extracted page via the selected model.

    Reads st.session_state.document_data, OCRs page images (pytesseract)
    only when a page has no text layer, and stores the accumulated pairs
    in st.session_state.qa_pairs.  Returns True on success, False on
    failure (stage set to 'error', message shown in the UI).
    """
    st.session_state.processing_stage = 'generating'
    qa_pairs = []

    model_name = st.session_state.model_settings['current_model']
    model_cfg = SUPPORTED_MODELS[model_name]
    api_key = st.session_state.api_keys.get(model_cfg['required_key'])
    if not api_key:
        # Fail early with a clear message instead of an opaque client error.
        st.session_state.processing_stage = 'error'
        st.error(f"Missing API key for {model_name}")
        return False

    try:
        client = openai.OpenAI(base_url=model_cfg['base_url'], api_key=api_key)

        for page in st.session_state.document_data:
            # Prefer the PDF text layer; fall back to OCR of page images.
            text_content = page['text'] or " ".join(
                pytesseract.image_to_string(img) for img in page['images']
            )

            response = client.chat.completions.create(
                model="gpt-4-turbo" if model_name == "OpenAI" else "deepseek-chat",
                messages=[{
                    "role": "user",
                    # BUG FIX: the parser below reads result['qa_pairs'], but the
                    # old prompt never asked for that wrapper key, so well-formed
                    # responses were silently discarded.  Request the exact schema.
                    "content": (
                        f"Generate 3 Q&A pairs from this financial content:\n{text_content}\n"
                        'Return JSON of the form {"qa_pairs": [{"question": "...", '
                        '"answer_1": "...", "answer_2": "..."}]}'
                    )
                }],
                response_format={"type": "json_object"},
                temperature=st.session_state.model_settings['temperature']
            )

            # Malformed JSON from the model is reported but not fatal.
            try:
                result = json.loads(response.choices[0].message.content)
                qa_pairs.extend(result.get('qa_pairs', []))
            except json.JSONDecodeError:
                st.error("Failed to parse model response")

        st.session_state.qa_pairs = qa_pairs
        return True
    except Exception as e:
        # Keep the UI stage consistent with process_pdf's failure handling.
        st.session_state.processing_stage = 'error'
        st.error(f"Generation failed: {str(e)}")
        return False
126
 
127
def export_data():
    """Serialize the generated Q&A pairs into the selected export formats.

    Formats come from st.session_state.export_formats ('JSON', 'CSV',
    'Parquet').  Returns a dict mapping timestamped file names
    (wealth_report_<ts>.<ext>) to encoded bytes, or None on failure.
    """
    formats = st.session_state.export_formats
    timestamp = time.strftime("%Y%m%d-%H%M%S")
    base_name = f"wealth_report_{timestamp}"
    export_package = {}

    try:
        # Build the DataFrame once and reuse it for both tabular formats
        # (previously it was constructed separately for CSV and Parquet).
        df = pd.DataFrame(st.session_state.qa_pairs)

        # JSON Export
        if 'JSON' in formats:
            export_package[f'{base_name}.json'] = json.dumps(
                st.session_state.qa_pairs,
                indent=2
            ).encode()

        # CSV Export
        if 'CSV' in formats:
            export_package[f'{base_name}.csv'] = df.to_csv(index=False).encode()

        # Parquet Export
        if 'Parquet' in formats:
            buffer = BytesIO()
            df.to_parquet(buffer)
            export_package[f'{base_name}.parquet'] = buffer.getvalue()

        return export_package
    except Exception as e:
        st.error(f"Export failed: {str(e)}")
        return None
158
+
159
def api_key_manager():
    """Sidebar panel that collects one API key per supported model.

    Each non-empty entry is stored in st.session_state.api_keys under the
    model's 'required_key' name, where generate_qa_content() looks it up.
    """
    with st.sidebar.expander("πŸ”‘ API Key Management", expanded=True):
        for model_name, cfg in SUPPORTED_MODELS.items():
            entered = st.text_input(
                f"{model_name} API Key",
                type="password",
                key=f"key_{model_name}"
            )
            if entered:
                st.session_state.api_keys[cfg['required_key']] = entered
170
+
171
def main_interface():
    """Core application page: upload, processing trigger, results, export."""
    st.title("Global Wealth Report Analyzer")
    st.write("Advanced financial document processing with multi-model AI support")

    # File Upload
    uploaded_file = st.file_uploader(
        "Upload PDF Report",
        type=["pdf"],
        accept_multiple_files=False
    )

    # Processing Controls
    if uploaded_file and st.button("Start Analysis"):
        if process_pdf(uploaded_file) and generate_qa_content():
            st.session_state.processing_stage = 'complete'

    # Results Display
    if st.session_state.processing_stage == 'complete':
        st.success("Analysis Complete!")

        # Data Export
        with st.expander("πŸ“¦ Export Results", expanded=True):
            cols = st.columns(3)
            with cols[0]:
                st.multiselect(
                    "Export Formats",
                    ['JSON', 'CSV', 'Parquet'],
                    default=['JSON', 'CSV'],
                    key='export_formats'
                )
            with cols[1]:
                # BUG FIX: the old code did export_data()['wealth_report.json'],
                # but export_data() keys are timestamped
                # (wealth_report_<ts>.json), so the lookup always raised
                # KeyError; the JSON payload was also mislabeled as a zip.
                # Also skip exporting entirely when there is nothing to export.
                package = export_data() if st.session_state.qa_pairs else None
                json_name = next(
                    (name for name in (package or {}) if name.endswith('.json')),
                    None
                )
                st.download_button(
                    "Download Results",
                    data=package[json_name] if json_name else b"",
                    file_name=json_name or "wealth_report.json",
                    mime="application/json",
                    disabled=not (st.session_state.qa_pairs and json_name)
                )

        # Results Preview
        with st.expander("πŸ” View Generated Content"):
            st.dataframe(
                pd.DataFrame(st.session_state.qa_pairs),
                use_container_width=True,
                height=400
            )
218
+
219
def model_settings():
    """Sidebar panel for model choice and sampling temperature.

    BUG FIX: the old version gave the widgets keys like
    'model_settings.current_model'.  Streamlit treats that as a flat
    session-state key — it does NOT update
    st.session_state.model_settings['current_model'] — so
    generate_qa_content() always saw the defaults regardless of the UI.
    Assign the widget return values into the nested dict instead.
    """
    with st.sidebar.expander("🧠 AI Settings", expanded=True):
        st.session_state.model_settings['current_model'] = st.selectbox(
            "AI Model",
            list(SUPPORTED_MODELS.keys()),
            key='ai_model_select'
        )
        st.session_state.model_settings['temperature'] = st.slider(
            "Creativity Level",
            0.0, 1.0, 0.3,
            key='ai_temperature'
        )
232
 
233
if __name__ == "__main__":
    # Streamlit executes this script top-to-bottom on every rerun.
    initialize_session()   # seed session-state defaults first
    api_key_manager()      # sidebar: API keys
    model_settings()       # sidebar: model + temperature
    main_interface()       # main page