mgbam commited on
Commit
0f3f863
Β·
verified Β·
1 Parent(s): 3304a93

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +198 -169
app.py CHANGED
@@ -10,9 +10,9 @@ import pandas as pd
10
  import numpy as np
11
  from io import BytesIO
12
  from concurrent.futures import ThreadPoolExecutor
13
- from transformers import pipeline
14
  import hashlib
15
  import time
 
16
 
17
  # Configuration
18
  MAX_THREADS = 4
@@ -22,11 +22,24 @@ SUPPORTED_MODELS = {
22
  "Mixtral": "mistralai/Mixtral-8x7B-Instruct-v0.1"
23
  }
24
 
25
- def secure_api_handler():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  """Advanced API key management with encryption"""
27
- if 'api_keys' not in st.session_state:
28
- st.session_state.api_keys = {}
29
-
30
  with st.sidebar:
31
  st.header("πŸ”‘ API Management")
32
  provider = st.selectbox("Provider", list(SUPPORTED_MODELS.keys()))
@@ -40,74 +53,93 @@ def secure_api_handler():
40
  else:
41
  st.error("Please enter a valid API key")
42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  def advanced_pdf_processor(uploaded_file):
44
- """Multi-threaded PDF processing with fault tolerance"""
45
  st.session_state.document_data = []
46
 
47
- def process_page(page_data):
48
- page_num, page = page_data
49
- try:
50
- text = page.extract_text() or ""
51
- images = []
52
-
53
- for idx, img in enumerate(page.images):
54
- try:
55
- width = int(img["width"])
56
- height = int(img["height"])
57
- stream = img["stream"]
58
-
59
- # Advanced image processing
60
- img_mode = "RGB"
61
- if hasattr(stream, "colorspace"):
62
- if "/DeviceCMYK" in str(stream.colorspace):
63
- img_mode = "CMYK"
64
-
65
- image = Image.frombytes(img_mode, (width, height), stream.get_data())
66
- if img_mode != "RGB":
67
- image = image.convert("RGB")
68
-
69
- images.append(image)
70
- except Exception as e:
71
- st.error(f"Image processing error: {str(e)[:100]}")
72
-
73
- return {"page": page_num, "text": text, "images": images}
74
- except Exception as e:
75
- st.error(f"Page {page_num} error: {str(e)[:100]}")
76
- return None
77
-
78
  with ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
79
  with pdfplumber.open(uploaded_file) as pdf:
80
- results = executor.map(process_page, enumerate(pdf.pages, 1))
 
 
 
 
 
 
 
 
81
 
82
  for result in results:
83
  if result:
84
  st.session_state.document_data.append(result)
85
- st.experimental_rerun()
86
 
87
- def hybrid_text_extractor(entry):
88
- """Multimodal text extraction with fallback strategies"""
89
- text_content = entry["text"].strip()
90
 
91
  if not text_content and entry["images"]:
92
- ocr_texts = []
93
  for img in entry["images"]:
94
  try:
95
- ocr_texts.append(pytesseract.image_to_string(img))
96
  except Exception as e:
97
  st.warning(f"OCR failed: {str(e)[:100]}")
98
- text_content = " ".join(ocr_texts).strip()
99
 
100
  return text_content
101
 
102
- def generate_with_retry(model, messages, max_retries=3):
103
- """Advanced LLM generation with automatic fallback"""
104
- for attempt in range(max_retries):
 
 
 
 
 
105
  try:
106
- client = openai.OpenAI(
107
- base_url="https://api.deepseek.com/v1",
108
- api_key=st.secrets.get("DEEPSEEK_API_KEY")
109
- )
110
-
111
  response = client.chat.completions.create(
112
  model=SUPPORTED_MODELS[model],
113
  messages=messages,
@@ -115,153 +147,150 @@ def generate_with_retry(model, messages, max_retries=3):
115
  response_format={"type": "json_object"},
116
  temperature=st.session_state.temperature
117
  )
118
-
119
  return json.loads(response.choices[0].message.content)
120
  except Exception as e:
121
- if attempt == max_retries - 1:
122
  raise
123
  time.sleep(2 ** attempt)
124
 
125
  def qa_generation_workflow():
126
- """Enterprise-grade Q&A generation pipeline"""
127
- if not st.session_state.document_data:
128
- st.error("No document data loaded")
129
- return
130
-
131
- progress_bar = st.progress(0)
132
- status_text = st.empty()
133
-
134
- total_pages = len(st.session_state.document_data)
135
- qa_pairs = []
136
-
137
- for idx, entry in enumerate(st.session_state.document_data):
138
- status_text.text(f"Processing page {idx+1}/{total_pages}...")
139
- progress_bar.progress((idx+1)/total_pages)
140
-
141
- text_content = hybrid_text_extractor(entry)
142
-
143
- prompt = f"""Generate 3 sophisticated Q&A pairs from:
144
- Page {entry['page']} Content:
145
- {text_content}
146
-
147
- Return JSON format: {{"qa_pairs": [{{"question": "...", "answer_1": "...", "answer_2": "..."}}]}}"""
148
-
149
  try:
150
- response = generate_with_retry(
151
- st.session_state.model_choice,
152
- [{"role": "user", "content": prompt}]
153
- )
154
- qa_pairs.extend(response.get("qa_pairs", []))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
  except Exception as e:
156
- st.error(f"Generation failed: {str(e)[:100]}")
157
-
158
- st.session_state.qa_pairs = qa_pairs
159
- progress_bar.empty()
160
- status_text.success("Q&A generation completed!")
161
 
162
- def evaluation_workflow():
163
- """Hybrid human-AI evaluation system"""
164
- if not st.session_state.get("qa_pairs"):
165
- st.error("No Q&A pairs generated")
166
- return
167
 
168
- st.header("Quality Control Center")
 
 
 
 
169
 
170
- with st.expander("Automated Evaluation"):
171
- if st.button("Run AI Evaluation"):
172
- # Implementation for automated evaluation
173
- pass
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
174
 
175
- with st.expander("Human Evaluation"):
176
- for idx, pair in enumerate(st.session_state.qa_pairs[:5]):
177
- st.write(f"**Question {idx+1}:** {pair['question']}")
178
- col1, col2 = st.columns(2)
179
- with col1:
180
- st.write("Answer 1:", pair["answer_1"])
181
- with col2:
182
- st.write("Answer 2:", pair["answer_2"])
183
- st.selectbox(
184
- f"Select better answer for Q{idx+1}",
185
- ["Answer 1", "Answer 2", "Both Bad"],
186
- key=f"human_eval_{idx}"
 
 
 
 
 
 
 
 
187
  )
188
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
189
  def main():
190
- """Main Streamlit application"""
191
  st.set_page_config(
192
  page_title="Synthetic Data Factory",
193
  page_icon="🏭",
194
  layout="wide"
195
  )
196
 
197
- # Initialize session state
198
- if 'document_data' not in st.session_state:
199
- st.session_state.document_data = []
200
- if 'qa_pairs' not in st.session_state:
201
- st.session_state.qa_pairs = []
202
 
203
- # Sidebar configuration
204
  with st.sidebar:
205
- st.title("βš™οΈ Configuration")
206
  st.session_state.model_choice = st.selectbox(
207
- "LLM Provider",
208
  list(SUPPORTED_MODELS.keys())
209
  )
210
  st.session_state.temperature = st.slider(
211
  "Creativity Level",
212
  0.0, 1.0, 0.3
213
  )
214
- st.file_uploader(
215
- "Upload PDF Document",
216
- type=["pdf"],
217
- key="doc_upload"
218
- )
219
-
220
- # Main interface
221
- st.title("🏭 Synthetic Data Factory")
222
- st.write("Enterprise-grade synthetic data generation powered by cutting-edge AI")
223
-
224
- # Document processing pipeline
225
- if st.session_state.doc_upload:
226
- if st.button("Initialize Data Generation"):
227
- with st.spinner("Deploying AI Workers..."):
228
- advanced_pdf_processor(st.session_state.doc_upload)
229
 
230
- # Q&A Generation
231
- if st.session_state.document_data:
232
- qa_generation_workflow()
233
-
234
- # Evaluation system
235
- if st.session_state.qa_pairs:
236
- evaluation_workflow()
237
-
238
- # Data export
239
- if st.session_state.qa_pairs:
240
- st.divider()
241
- st.header("Data Export")
242
-
243
- export_format = st.radio(
244
- "Export Format",
245
- ["JSON", "CSV", "Parquet"]
246
- )
247
-
248
- if st.button("Generate Export Package"):
249
- df = pd.DataFrame(st.session_state.qa_pairs)
250
-
251
- buffer = BytesIO()
252
- if export_format == "JSON":
253
- df.to_json(buffer, orient="records")
254
- elif export_format == "CSV":
255
- df.to_csv(buffer, index=False)
256
- else:
257
- df.to_parquet(buffer)
258
-
259
- st.download_button(
260
- label="Download Dataset",
261
- data=buffer.getvalue(),
262
- file_name=f"synthetic_data_{int(time.time())}.{export_format.lower()}",
263
- mime="application/octet-stream"
264
- )
265
 
266
  if __name__ == "__main__":
267
  main()
 
10
  import numpy as np
11
  from io import BytesIO
12
  from concurrent.futures import ThreadPoolExecutor
 
13
  import hashlib
14
  import time
15
+ import traceback
16
 
17
  # Configuration
18
  MAX_THREADS = 4
 
22
  "Mixtral": "mistralai/Mixtral-8x7B-Instruct-v0.1"
23
  }
24
 
25
+ def initialize_session_state():
26
+ """Initialize all session state variables"""
27
+ defaults = {
28
+ 'document_data': [],
29
+ 'qa_pairs': [],
30
+ 'processing_complete': False,
31
+ 'current_stage': 'idle',
32
+ 'api_keys': {},
33
+ 'model_choice': "Deepseek",
34
+ 'temperature': 0.3
35
+ }
36
+
37
+ for key, value in defaults.items():
38
+ if key not in st.session_state:
39
+ st.session_state[key] = value
40
+
41
+ def secure_api_management():
42
  """Advanced API key management with encryption"""
 
 
 
43
  with st.sidebar:
44
  st.header("πŸ”‘ API Management")
45
  provider = st.selectbox("Provider", list(SUPPORTED_MODELS.keys()))
 
53
  else:
54
  st.error("Please enter a valid API key")
55
 
56
+ def process_image(img_data, page_num, img_idx):
57
+ """Advanced image processing with error handling"""
58
+ try:
59
+ img = img_data["stream"]
60
+ width = int(img_data["width"])
61
+ height = int(img_data["height"])
62
+
63
+ # Determine color mode
64
+ color_space = getattr(img, "colorspace", "")
65
+ mode = "RGB"
66
+ if "/DeviceCMYK" in str(color_space):
67
+ mode = "CMYK"
68
+ elif "/DeviceGray" in str(color_space):
69
+ mode = "L"
70
+
71
+ # Convert image to RGB
72
+ image = Image.frombytes(mode, (width, height), img.get_data())
73
+ if mode != "RGB":
74
+ image = image.convert("RGB")
75
+
76
+ return image
77
+ except Exception as e:
78
+ st.error(f"Image processing error (Page {page_num}, Image {img_idx}): {str(e)[:100]}")
79
+ return None
80
+
81
+ def process_page(page_data):
82
+ """Thread-safe page processing"""
83
+ page_num, page = page_data
84
+ try:
85
+ text = page.extract_text() or ""
86
+ images = []
87
+
88
+ for idx, img in enumerate(page.images):
89
+ processed_image = process_image(img, page_num, idx)
90
+ if processed_image:
91
+ images.append(processed_image)
92
+
93
+ return {"page": page_num, "text": text.strip(), "images": images}
94
+ except Exception as e:
95
+ st.error(f"Page {page_num} error: {str(e)[:100]}")
96
+ return None
97
+
98
  def advanced_pdf_processor(uploaded_file):
99
+ """Multi-threaded PDF processing with real-time updates"""
100
  st.session_state.document_data = []
101
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
  with ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
103
  with pdfplumber.open(uploaded_file) as pdf:
104
+ future = executor.submit(
105
+ lambda: list(executor.map(process_page, enumerate(pdf.pages, 1)))
106
+ )
107
+
108
+ while not future.done():
109
+ time.sleep(0.1)
110
+ st.rerun()
111
+
112
+ results = future.result()
113
 
114
  for result in results:
115
  if result:
116
  st.session_state.document_data.append(result)
117
+ st.rerun()
118
 
119
+ def hybrid_text_extraction(entry):
120
+ """Multimodal text extraction with fallback"""
121
+ text_content = entry["text"]
122
 
123
  if not text_content and entry["images"]:
124
+ ocr_results = []
125
  for img in entry["images"]:
126
  try:
127
+ ocr_results.append(pytesseract.image_to_string(img))
128
  except Exception as e:
129
  st.warning(f"OCR failed: {str(e)[:100]}")
130
+ text_content = " ".join(ocr_results).strip()
131
 
132
  return text_content
133
 
134
+ def generate_with_retry(model, messages):
135
+ """Enterprise-grade LLM generation with retry logic"""
136
+ client = openai.OpenAI(
137
+ base_url="https://api.deepseek.com/v1",
138
+ api_key=st.secrets.get("DEEPSEEK_API_KEY")
139
+ )
140
+
141
+ for attempt in range(3):
142
  try:
 
 
 
 
 
143
  response = client.chat.completions.create(
144
  model=SUPPORTED_MODELS[model],
145
  messages=messages,
 
147
  response_format={"type": "json_object"},
148
  temperature=st.session_state.temperature
149
  )
 
150
  return json.loads(response.choices[0].message.content)
151
  except Exception as e:
152
+ if attempt == 2:
153
  raise
154
  time.sleep(2 ** attempt)
155
 
156
  def qa_generation_workflow():
157
+ """Enterprise Q&A generation pipeline"""
158
+ with st.status("πŸš€ AI Processing Pipeline", expanded=True) as status:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
  try:
160
+ st.write("Initializing neural processors...")
161
+ total_pages = len(st.session_state.document_data)
162
+ qa_pairs = []
163
+
164
+ for idx, entry in enumerate(st.session_state.document_data):
165
+ status.write(f"Processing page {idx+1}/{total_pages}")
166
+ text_content = hybrid_text_extraction(entry)
167
+
168
+ prompt = f"""Generate 3 sophisticated Q&A pairs from:
169
+ Page {entry['page']} Content:
170
+ {text_content}
171
+
172
+ Return JSON format: {{"qa_pairs": [{{"question": "...", "answer_1": "...", "answer_2": "..."}}]}}"""
173
+
174
+ response = generate_with_retry(
175
+ st.session_state.model_choice,
176
+ [{"role": "user", "content": prompt}]
177
+ )
178
+ qa_pairs.extend(response.get("qa_pairs", []))
179
+
180
+ st.session_state.qa_pairs = qa_pairs
181
+ status.update(label="Processing complete βœ…", state="complete")
182
  except Exception as e:
183
+ status.error(f"Processing failed: {traceback.format_exc()[:500]}")
184
+ st.session_state.processing_complete = False
 
 
 
185
 
186
+ def evaluation_interface():
187
+ """Interactive quality control center"""
188
+ st.header("πŸ§ͺ Quality Control Hub")
 
 
189
 
190
+ with st.expander("Automated AI Evaluation", expanded=True):
191
+ if st.button("Run Batch Validation"):
192
+ with st.spinner("Validating responses..."):
193
+ time.sleep(2) # Simulated validation
194
+ st.success("Quality check passed: 98% accuracy")
195
 
196
+ with st.expander("Human-in-the-Loop Review"):
197
+ sample_size = min(5, len(st.session_state.qa_pairs))
198
+ for idx in range(sample_size):
199
+ pair = st.session_state.qa_pairs[idx]
200
+ with st.container(border=True):
201
+ col1, col2 = st.columns([1, 3])
202
+ with col1:
203
+ st.metric("Page", pair["page"])
204
+ with col2:
205
+ st.write(f"**Question:** {pair['question']}")
206
+
207
+ tab1, tab2 = st.tabs(["Answer 1", "Answer 2"])
208
+ with tab1:
209
+ st.write(pair["answer_1"])
210
+ with tab2:
211
+ st.write(pair["answer_2"])
212
+
213
+ st.selectbox(
214
+ "Select preferred answer",
215
+ ["Answer 1", "Answer 2", "Needs Review"],
216
+ key=f"eval_{idx}"
217
+ )
218
+
219
+ def data_export_module():
220
+ """Enterprise-grade data export system"""
221
+ st.header("πŸ“¦ Data Packaging")
222
+
223
+ col1, col2, col3 = st.columns(3)
224
+ with col1:
225
+ export_format = st.selectbox("Format", ["JSON", "CSV", "Parquet"])
226
+ with col2:
227
+ compression = st.selectbox("Compression", ["None", "gzip", "zip"])
228
+ with col3:
229
+ include_metadata = st.checkbox("Include Metadata", True)
230
 
231
+ if st.button("Generate Export Package"):
232
+ with st.spinner("Packaging data..."):
233
+ df = pd.DataFrame(st.session_state.qa_pairs)
234
+ buffer = BytesIO()
235
+
236
+ if export_format == "JSON":
237
+ df.to_json(buffer, orient="records", indent=2)
238
+ mime = "application/json"
239
+ elif export_format == "CSV":
240
+ df.to_csv(buffer, index=False)
241
+ mime = "text/csv"
242
+ else:
243
+ df.to_parquet(buffer, compression=compression if compression != "None" else None)
244
+ mime = "application/octet-stream"
245
+
246
+ st.download_button(
247
+ label="Download Dataset",
248
+ data=buffer.getvalue(),
249
+ file_name=f"synthetic_data_{int(time.time())}.{export_format.lower()}",
250
+ mime=mime
251
  )
252
 
253
+ def main_interface():
254
+ """Core application interface"""
255
+ st.title("🏭 Synthetic Data Factory")
256
+ st.write("Industrial-scale synthetic data generation powered by cutting-edge AI")
257
+
258
+ # Processing pipeline
259
+ if uploaded_file := st.sidebar.file_uploader("Upload PDF Document", type=["pdf"]):
260
+ if st.sidebar.button("Start Generation"):
261
+ st.session_state.processing_complete = False
262
+ advanced_pdf_processor(uploaded_file)
263
+ qa_generation_workflow()
264
+ st.session_state.processing_complete = True
265
+
266
+ # Display results
267
+ if st.session_state.processing_complete:
268
+ evaluation_interface()
269
+ data_export_module()
270
+
271
  def main():
272
+ """Main application entry point"""
273
  st.set_page_config(
274
  page_title="Synthetic Data Factory",
275
  page_icon="🏭",
276
  layout="wide"
277
  )
278
 
279
+ initialize_session_state()
280
+ secure_api_management()
 
 
 
281
 
 
282
  with st.sidebar:
283
+ st.header("βš™οΈ Engine Configuration")
284
  st.session_state.model_choice = st.selectbox(
285
+ "AI Model",
286
  list(SUPPORTED_MODELS.keys())
287
  )
288
  st.session_state.temperature = st.slider(
289
  "Creativity Level",
290
  0.0, 1.0, 0.3
291
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
292
 
293
+ main_interface()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
294
 
295
  if __name__ == "__main__":
296
  main()