mgbam committed on
Commit
218d2f0
·
verified ·
1 Parent(s): 0f3f863

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +156 -221
app.py CHANGED
@@ -17,280 +17,215 @@ import traceback
# Configuration
MAX_THREADS = 4  # worker count handed to ThreadPoolExecutor

# UI label -> model identifier passed as ``model=`` to the chat completion call.
SUPPORTED_MODELS = {
    "Deepseek": "deepseek-chat",
    "Llama-3-70B": "meta-llama/Meta-Llama-3-70B-Instruct",
    "Mixtral": "mistralai/Mixtral-8x7B-Instruct-v0.1",
}
24
 
 
 
 
 
 
def initialize_session_state():
    """Create every session-state key the app reads, if it is not there yet.

    Keys already present in ``st.session_state`` are never overwritten; only
    the missing ones receive their default value.
    """
    initial_values = dict(
        document_data=[],
        qa_pairs=[],
        processing_complete=False,
        current_stage='idle',
        api_keys={},
        model_choice="Deepseek",
        temperature=0.3,
    )

    missing = [name for name in initial_values if name not in st.session_state]
    for name in missing:
        st.session_state[name] = initial_values[name]
40
 
def secure_api_management():
    """Render the sidebar form that records an API key for a provider.

    When "Store Key" is pressed with a non-empty input, the SHA-256 hex
    digest of the key is saved under ``st.session_state.api_keys[provider]``;
    an empty input produces an error message instead.

    NOTE(review): only the digest is kept. A SHA-256 digest is one-way, so the
    stored value cannot later be sent to a provider as a credential — confirm
    that a fingerprint of the key is all this feature is meant to keep.
    """
    with st.sidebar:
        st.header("🔑 API Management")
        provider = st.selectbox("Provider", list(SUPPORTED_MODELS.keys()))
        entered_key = st.text_input(f"Enter {provider} API Key", type="password")

        if not st.button("Store Key"):
            return

        if not entered_key:
            st.error("Please enter a valid API key")
            return

        digest = hashlib.sha256(entered_key.encode()).hexdigest()
        st.session_state.api_keys[provider] = digest
        st.success("Key stored securely")
55
 
def process_image(img_data, page_num, img_idx):
    """Decode one embedded PDF image into an RGB ``PIL.Image``.

    Args:
        img_data: Mapping describing the image. Must provide ``"stream"``
            (an object with ``get_data()``), ``"width"`` and ``"height"``;
            a ``"colorspace"`` entry is used when present.
        page_num: Page number, used only in the error message.
        img_idx: Index of the image on its page, used only in the error
            message.

    Returns:
        The image converted to RGB, or ``None`` if anything failed (the
        failure is reported through ``st.error``).
    """
    try:
        img = img_data["stream"]
        width = int(img_data["width"])
        height = int(img_data["height"])

        # Determine color mode. The colour space is looked up on the image
        # record first and on the stream object second; the previous code only
        # probed the stream attribute. The leading "/" is no longer part of
        # the match because a PDF name literal may be rendered with quotes
        # between the slash and the name.
        color_space = str(
            img_data.get("colorspace") or getattr(img, "colorspace", "")
        )
        if "DeviceCMYK" in color_space:
            mode = "CMYK"
        elif "DeviceGray" in color_space:
            mode = "L"
        else:
            mode = "RGB"

        # Convert image to RGB
        image = Image.frombytes(mode, (width, height), img.get_data())
        if mode != "RGB":
            image = image.convert("RGB")

        return image
    except Exception as e:
        # Best effort: one broken image must not abort the whole page.
        st.error(f"Image processing error (Page {page_num}, Image {img_idx}): {str(e)[:100]}")
        return None
80
-
def process_page(page_data):
    """Extract the text and the decodable images of a single PDF page.

    Args:
        page_data: ``(page_num, page)`` pair; ``page`` must offer
            ``extract_text()`` and an ``images`` sequence.

    Returns:
        ``{"page": ..., "text": ..., "images": [...]}`` with the text stripped
        and only the images ``process_image`` could decode, or ``None`` when
        an exception occurred (reported through ``st.error``).
    """
    page_num, page = page_data
    try:
        raw_text = page.extract_text()
        decoded = (
            process_image(item, page_num, position)
            for position, item in enumerate(page.images)
        )
        kept = [picture for picture in decoded if picture]
        return {"page": page_num, "text": (raw_text or "").strip(), "images": kept}
    except Exception as e:
        st.error(f"Page {page_num} error: {str(e)[:100]}")
        return None
97
 
def advanced_pdf_processor(uploaded_file):
    """Extract every page of a PDF into ``st.session_state.document_data``.

    Pages are passed to ``process_page`` through a thread pool with
    ``MAX_THREADS`` workers. ``executor.map`` yields results in input order,
    so the stored entries stay in page order. Pages for which
    ``process_page`` returned ``None`` are dropped.

    Args:
        uploaded_file: Whatever ``pdfplumber.open`` accepts (here: the
            object returned by the Streamlit file uploader).
    """
    st.session_state.document_data = []

    # The pool is nested inside the PDF context so it is shut down (all pages
    # finished) before the PDF is closed.
    # NOTE(review): all workers read pages of the same open document — confirm
    # pdfplumber tolerates concurrent extraction, otherwise use one worker.
    with pdfplumber.open(uploaded_file) as pdf:
        with ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
            results = list(executor.map(process_page, enumerate(pdf.pages, 1)))

    # No st.rerun() here: it ends the current script run immediately, so the
    # old polling loop discarded the work on its first iteration and the
    # trailing rerun kept the caller from ever reaching Q&A generation.
    st.session_state.document_data = [
        result for result in results if result is not None
    ]
118
-
def hybrid_text_extraction(entry):
    """Return a page's text, falling back to OCR of its images.

    Args:
        entry: Page record with a ``"text"`` string and an ``"images"`` list.

    Returns:
        ``entry["text"]`` unchanged when it is non-empty or when the page has
        no images; otherwise the space-joined, stripped OCR output of every
        image. An image whose OCR raises is reported with ``st.warning`` and
        skipped.
    """
    extracted = entry["text"]
    if extracted or not entry["images"]:
        return extracted

    recognized = []
    for picture in entry["images"]:
        try:
            recognized.append(pytesseract.image_to_string(picture))
        except Exception as e:
            st.warning(f"OCR failed: {str(e)[:100]}")
    return " ".join(recognized).strip()
133
 
def generate_with_retry(model, messages):
    """Request a JSON chat completion, retrying with exponential back-off.

    Args:
        model: Key of ``SUPPORTED_MODELS`` naming the model to use.
        messages: Chat messages in the format expected by the OpenAI client.

    Returns:
        The response content parsed with ``json.loads``.

    Raises:
        KeyError: ``model`` is not in ``SUPPORTED_MODELS``. Raised before any
            request is made, so a bad name is not retried.
        Exception: Whatever the final attempt raised (request error or
            invalid JSON), re-raised unchanged.
    """
    max_attempts = 3

    # NOTE(review): the endpoint is always DeepSeek's, whichever entry of
    # SUPPORTED_MODELS is selected — confirm the other model ids are served
    # there.
    client = openai.OpenAI(
        base_url="https://api.deepseek.com/v1",
        api_key=st.secrets.get("DEEPSEEK_API_KEY")
    )
    # Resolved once, outside the retry loop: an unknown model is a caller
    # error that no amount of waiting will fix.
    model_id = SUPPORTED_MODELS[model]

    for attempt in range(max_attempts):
        try:
            response = client.chat.completions.create(
                model=model_id,
                messages=messages,
                max_tokens=2048,
                response_format={"type": "json_object"},
                temperature=st.session_state.temperature
            )
            return json.loads(response.choices[0].message.content)
        except Exception:
            if attempt == max_attempts - 1:
                raise
            time.sleep(2 ** attempt)  # 1s, then 2s
155
-
def qa_generation_workflow():
    """Generate Q&A pairs for every extracted page and store them.

    For each entry of ``st.session_state.document_data`` the page text (or
    its OCR fallback) is sent to the selected model. Each returned pair is
    tagged with the page it came from and collected into
    ``st.session_state.qa_pairs``. Progress is shown in an ``st.status``
    widget.

    On any exception the truncated traceback is shown, the widget is switched
    to its error state, ``processing_complete`` is set to ``False`` and the
    previously stored pairs are left untouched.
    """
    with st.status("🚀 AI Processing Pipeline", expanded=True) as status:
        try:
            st.write("Initializing neural processors...")
            pages = st.session_state.document_data
            total_pages = len(pages)
            qa_pairs = []

            for position, entry in enumerate(pages, start=1):
                status.write(f"Processing page {position}/{total_pages}")
                text_content = hybrid_text_extraction(entry)

                prompt = (
                    "Generate 3 sophisticated Q&A pairs from:\n"
                    f"Page {entry['page']} Content:\n"
                    f"{text_content}\n"
                    "\n"
                    'Return JSON format: {"qa_pairs": [{"question": "...", '
                    '"answer_1": "...", "answer_2": "..."}]}'
                )

                response = generate_with_retry(
                    st.session_state.model_choice,
                    [{"role": "user", "content": prompt}]
                )
                for pair in response.get("qa_pairs", []):
                    if not isinstance(pair, dict):
                        continue  # the review UI can only render mappings
                    # The requested schema has no page field, but the review
                    # UI displays pair["page"], so record it here.
                    pair.setdefault("page", entry["page"])
                    qa_pairs.append(pair)

            st.session_state.qa_pairs = qa_pairs
            status.update(label="Processing complete ✅", state="complete")
        except Exception:
            status.error(f"Processing failed: {traceback.format_exc()[:500]}")
            # Set the state explicitly so the widget does not finish in its
            # default "complete" look after a failure.
            status.update(state="error")
            st.session_state.processing_complete = False
185
-
def evaluation_interface():
    """Render the review section for the generated Q&A pairs.

    Shows a "Run Batch Validation" button and a manual review list with up to
    the first five entries of ``st.session_state.qa_pairs``. Missing fields
    in a pair are rendered as placeholders instead of raising ``KeyError``,
    because the pairs come from model output and may not follow the
    requested schema.
    """
    st.header("🧪 Quality Control Hub")

    with st.expander("Automated AI Evaluation", expanded=True):
        if st.button("Run Batch Validation"):
            with st.spinner("Validating responses..."):
                time.sleep(2)  # Simulated validation
                # NOTE(review): placeholder — nothing is evaluated and the
                # reported figure is a constant.
                st.success("Quality check passed: 98% accuracy")

    with st.expander("Human-in-the-Loop Review"):
        for idx, pair in enumerate(st.session_state.qa_pairs[:5]):
            with st.container(border=True):
                col1, col2 = st.columns([1, 3])
                with col1:
                    st.metric("Page", pair.get("page", "—"))
                with col2:
                    st.write(f"**Question:** {pair.get('question', '')}")

                tab1, tab2 = st.tabs(["Answer 1", "Answer 2"])
                with tab1:
                    st.write(pair.get("answer_1", ""))
                with tab2:
                    st.write(pair.get("answer_2", ""))

                st.selectbox(
                    "Select preferred answer",
                    ["Answer 1", "Answer 2", "Needs Review"],
                    key=f"eval_{idx}"
                )
218
-
def data_export_module():
    """Render the export controls and offer the Q&A pairs as a download.

    The pairs in ``st.session_state.qa_pairs`` are serialised as JSON, CSV or
    Parquet. The chosen compression is applied to every format:

    * ``gzip`` — JSON/CSV bytes are gzip-compressed (``.gz`` suffix); Parquet
      uses gzip as its internal codec, as before.
    * ``zip`` — the serialised file is wrapped in a single-entry zip archive
      (``.zip`` suffix). ``zip`` is no longer passed to ``to_parquet``, which
      does not accept it as a codec.

    File name and MIME type follow the final container.
    """
    # Local imports: this block cannot see the file's import section.
    import gzip
    import zipfile

    st.header("📦 Data Packaging")

    col1, col2, col3 = st.columns(3)
    with col1:
        export_format = st.selectbox("Format", ["JSON", "CSV", "Parquet"])
    with col2:
        compression = st.selectbox("Compression", ["None", "gzip", "zip"])
    with col3:
        # NOTE(review): the checkbox is rendered but its value is not used by
        # the export yet — define what "metadata" should add before wiring it.
        st.checkbox("Include Metadata", True)

    if not st.button("Generate Export Package"):
        return

    with st.spinner("Packaging data..."):
        df = pd.DataFrame(st.session_state.qa_pairs)

        if export_format == "JSON":
            payload = df.to_json(orient="records", indent=2).encode("utf-8")
            mime = "application/json"
        elif export_format == "CSV":
            payload = df.to_csv(index=False).encode("utf-8")
            mime = "text/csv"
        else:
            buffer = BytesIO()
            df.to_parquet(
                buffer,
                compression="gzip" if compression == "gzip" else None,
            )
            payload = buffer.getvalue()
            mime = "application/octet-stream"

        inner_name = f"synthetic_data_{int(time.time())}.{export_format.lower()}"
        file_name = inner_name

        if compression == "gzip" and export_format != "Parquet":
            payload = gzip.compress(payload)
            file_name = f"{inner_name}.gz"
            mime = "application/gzip"
        elif compression == "zip":
            archive = BytesIO()
            with zipfile.ZipFile(archive, "w", zipfile.ZIP_DEFLATED) as bundle:
                bundle.writestr(inner_name, payload)
            payload = archive.getvalue()
            file_name = f"{inner_name}.zip"
            mime = "application/zip"

        st.download_button(
            label="Download Dataset",
            data=payload,
            file_name=file_name,
            mime=mime
        )
252
-
def main_interface():
    """Render the main page: title, processing trigger and result sections.

    When a PDF is uploaded and "Start Generation" is pressed, the document is
    extracted and Q&A pairs are generated. ``processing_complete`` is set to
    ``True`` only if that run actually produced pairs; the evaluation and
    export sections are shown while the flag is set.
    """
    st.title("🏭 Synthetic Data Factory")
    st.write("Industrial-scale synthetic data generation powered by cutting-edge AI")

    # Processing pipeline
    if uploaded_file := st.sidebar.file_uploader("Upload PDF Document", type=["pdf"]):
        if st.sidebar.button("Start Generation"):
            st.session_state.processing_complete = False
            # Clear earlier results so a failed run cannot pass off the
            # previous document's pairs as its own.
            st.session_state.qa_pairs = []
            advanced_pdf_processor(uploaded_file)
            qa_generation_workflow()
            # qa_generation_workflow reports failures itself and returns
            # nothing, so success is judged by whether pairs were stored.
            st.session_state.processing_complete = bool(st.session_state.qa_pairs)

    # Display results
    if st.session_state.processing_complete:
        evaluation_interface()
        data_export_module()
270
 
def main():
    """Configure the page, build the sidebar and hand over to the main view.

    The sidebar widgets write the selected model and temperature into
    ``st.session_state`` on every run.
    """
    st.set_page_config(
        page_title="Synthetic Data Factory",
        page_icon="🏭",
        layout="wide",
    )

    initialize_session_state()
    secure_api_management()

    model_names = list(SUPPORTED_MODELS)
    with st.sidebar:
        st.header("⚙️ Engine Configuration")
        chosen_model = st.selectbox("AI Model", model_names)
        st.session_state.model_choice = chosen_model
        chosen_temperature = st.slider("Creativity Level", 0.0, 1.0, 0.3)
        st.session_state.temperature = chosen_temperature

    main_interface()


if __name__ == "__main__":
    main()
 
# Configuration
MAX_THREADS = 4

# UI label -> connection details: "model" is sent as the model id and
# "base_url" is the endpoint the OpenAI client is pointed at.
SUPPORTED_MODELS = {
    "Deepseek": dict(
        model="deepseek-chat",
        base_url="https://api.deepseek.com/v1",
    ),
}
25
 
def debug_log(message):
    """Show ``message`` as a toast, but only while debug mode is switched on.

    Args:
        message: Text to display; it is prefixed with ``DEBUG: ``.
    """
    debug_enabled = st.session_state.get("debug_mode")
    if not debug_enabled:
        return
    st.toast(f"DEBUG: {message}", icon="🐛")
30
+
def initialize_session_state():
    """Give every session-state key used by the app an initial value.

    A key that already exists in ``st.session_state`` keeps its current
    value; only absent keys are created.
    """
    state = st.session_state
    for name, fallback in (
        ('document_data', []),
        ('qa_pairs', []),
        ('processing_complete', False),
        ('current_stage', 'idle'),
        ('api_keys', {}),
        ('model_choice', "Deepseek"),
        ('temperature', 0.3),
        ('debug_mode', True),
    ):
        if name in state:
            continue
        state[name] = fallback
47
 
def show_processing_status():
    """Show the current pipeline stage in the sidebar.

    Reads ``st.session_state.current_stage``, logs it through ``debug_log``
    and writes the matching label to the sidebar; a stage without a label is
    shown as ``Unknown``.
    """
    status_messages = {
        'idle': "🟢 Ready to process",
        'extracting': "🔍 Extracting document content...",
        'generating': "🧠 Generating Q&A pairs...",
        'evaluating': "📊 Evaluating results...",
        # The icon was missing here, leaving a label that started with a
        # stray space, unlike every other entry.
        'error': "❌ Processing failed"
    }

    status = st.session_state.current_stage
    debug_log(f"Status update: {status}")
    st.sidebar.markdown(f"**System Status:** {status_messages.get(status, 'Unknown')}")
 
61
 
def process_image(img_data, page_num, img_idx):
    """Turn one embedded PDF image into an RGB ``PIL.Image``.

    The stream bytes are decoded once. They are first offered to
    ``Image.open`` so that streams holding an encoded image file (for example
    JPEG) are decoded properly instead of being read as pixel samples. If
    that fails, the bytes are treated as raw samples: exactly four bytes per
    pixel is read as CMYK, otherwise RGB is tried and grayscale is the last
    resort.

    Args:
        img_data: Mapping with ``"stream"`` (object providing ``get_data()``),
            ``"width"`` and ``"height"``.
        page_num: Page number, used in log and error messages.
        img_idx: Index of the image on its page, used in log and error
            messages.

    Returns:
        An RGB image, or ``None`` if the image could not be decoded (the
        failure is reported through ``st.error``).
    """
    from io import BytesIO  # local import: the file's import section is out of view

    try:
        img = img_data["stream"]
        width = int(img_data["width"])
        height = int(img_data["height"])

        debug_log(f"Processing image {img_idx} on page {page_num}")

        raw = img.get_data()

        try:
            return Image.open(BytesIO(raw)).convert("RGB")
        except OSError:
            # Not a recognisable or complete image file: fall through and
            # interpret the bytes as raw samples.
            pass

        # NOTE(review): width/height are taken from the record as-is; if they
        # are page-space rather than pixel dimensions the raw path mis-sizes
        # the image — confirm which of the record's size fields is in pixels.
        size = (width, height)
        pixel_count = width * height
        if pixel_count and len(raw) == pixel_count * 4:
            return Image.frombytes("CMYK", size, raw).convert("RGB")

        # Convert image to RGB
        try:
            return Image.frombytes("RGB", size, raw)
        except ValueError:
            # Too few bytes for three channels: assume one byte per pixel.
            return Image.frombytes("L", size, raw).convert("RGB")

    except Exception as e:
        st.error(f"Image processing failed (Page {page_num}, Image {img_idx}): {str(e)}")
        return None
80
 
def pdf_processing_workflow(uploaded_file):
    """Extract text and images from every page of the uploaded PDF.

    Sets ``current_stage`` to ``'extracting'``, clears
    ``st.session_state.document_data`` and then appends one
    ``{"page", "text", "images"}`` record per page while updating a progress
    bar. A page that raises is reported and skipped; the remaining pages are
    still processed.

    Args:
        uploaded_file: Whatever ``pdfplumber.open`` accepts (here: the object
            returned by the Streamlit file uploader).

    Returns:
        ``True`` when the document was read to the end, ``False`` when
        opening or iterating it failed (``current_stage`` is then
        ``'error'``).
    """
    st.session_state.current_stage = 'extracting'
    # Start from a clean slate: without this a second run appended its pages
    # to those of the previous document.
    st.session_state.document_data = []

    try:
        with pdfplumber.open(uploaded_file) as pdf:
            total_pages = len(pdf.pages)
            progress_bar = st.progress(0)
            status_text = st.empty()

            for page_num, page in enumerate(pdf.pages, 1):
                status_text.text(f"Processing page {page_num}/{total_pages}")
                progress_bar.progress(page_num / total_pages)

                try:
                    text = page.extract_text() or ""
                    images = [
                        process_image(img, page_num, idx)
                        for idx, img in enumerate(page.images)
                    ]

                    st.session_state.document_data.append({
                        "page": page_num,
                        "text": text.strip(),
                        "images": [img for img in images if img is not None]
                    })
                except Exception as e:
                    st.error(f"Page {page_num} error: {str(e)}")

            progress_bar.empty()
            status_text.success("Document processing complete!")
            return True

    except Exception as e:
        st.session_state.current_stage = 'error'
        st.error(f"PDF processing failed: {str(e)}")
        debug_log(traceback.format_exc())
        return False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
 
def generate_qa_pairs():
    """Ask the selected model for Q&A pairs for every extracted page.

    For each record in ``st.session_state.document_data`` the page text, or
    the OCR text of its images when there is no text, is sent to the model
    with a request for JSON output. Pages with nothing to send are skipped.
    A response that is not a JSON object is reported and skipped. Only pairs
    that are mappings containing ``question``, ``answer_1`` and ``answer_2``
    are kept, because the result view reads those keys.

    Returns:
        ``True`` after all pages were handled; the pairs are then stored in
        ``st.session_state.qa_pairs`` and ``current_stage`` becomes
        ``'evaluating'``. ``False`` if an unexpected exception stopped the
        run (``current_stage`` is then ``'error'``).
    """
    st.session_state.current_stage = 'generating'
    qa_pairs = []
    required_fields = ("question", "answer_1", "answer_2")

    try:
        model_config = SUPPORTED_MODELS[st.session_state.model_choice]
        client = openai.OpenAI(
            base_url=model_config["base_url"],
            api_key=st.secrets["DEEPSEEK_API_KEY"]
        )

        for entry in st.session_state.document_data:
            text_content = entry["text"] or " ".join(
                pytesseract.image_to_string(img) for img in entry["images"]
            ).strip()
            if not text_content:
                # Nothing to ask about; an empty prompt would only cost a
                # request and invite invented content.
                debug_log(f"Skipping page {entry['page']}: no text found")
                continue

            # The format example uses double quotes so that it is itself
            # valid JSON.
            prompt = (
                f"Generate 3 Q&A pairs from:\n{text_content}\n"
                'Return JSON format: {"qa_pairs": [{"question": "...", '
                '"answer_1": "...", "answer_2": "..."}]}'
            )
            response = client.chat.completions.create(
                model=model_config["model"],
                messages=[{"role": "user", "content": prompt}],
                max_tokens=2048,
                response_format={"type": "json_object"},
                temperature=st.session_state.temperature
            )

            try:
                # TypeError covers a response whose content is None.
                result = json.loads(response.choices[0].message.content)
            except (json.JSONDecodeError, TypeError):
                result = None

            if not isinstance(result, dict):
                st.error(f"Invalid response format from API for page {entry['page']}")
                continue

            candidates = result.get("qa_pairs", [])
            if not isinstance(candidates, list):
                candidates = []
            page_pairs = [
                pair for pair in candidates
                if isinstance(pair, dict)
                and all(field in pair for field in required_fields)
            ]
            qa_pairs.extend(page_pairs)
            debug_log(f"Generated {len(page_pairs)} pairs for page {entry['page']}")

        st.session_state.qa_pairs = qa_pairs
        st.session_state.current_stage = 'evaluating'
        return True

    except Exception as e:
        st.session_state.current_stage = 'error'
        st.error(f"Q&A generation failed: {str(e)}")
        debug_log(traceback.format_exc())
        return False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
163
 
def main():
    """Build the page: configuration sidebar, upload, results and debug panel.

    Pressing "Start Processing" with an uploaded PDF runs extraction and Q&A
    generation. Results are rendered from ``st.session_state`` on every run
    while ``processing_complete`` is set, so they stay visible after the
    rerun that any later widget interaction (including the download button)
    triggers.
    """
    st.set_page_config(
        page_title="Synthetic Data Generator",
        page_icon="🧪",
        layout="wide"
    )

    initialize_session_state()

    # Debug panel
    with st.sidebar:
        st.header("⚙️ Configuration")
        st.session_state.model_choice = st.selectbox(
            "AI Model", list(SUPPORTED_MODELS.keys())
        )
        st.session_state.temperature = st.slider(
            "Creativity Level", 0.0, 1.0, 0.3
        )
        st.session_state.debug_mode = st.checkbox("Debug Mode", True)

    st.title("🧪 Synthetic Data Generator")

    # File upload section
    uploaded_file = st.file_uploader("Upload PDF Document", type=["pdf"])

    if uploaded_file and st.button("Start Processing"):
        # Drop earlier results first so a failed run does not keep showing
        # the previous document's pairs.
        st.session_state.qa_pairs = []
        st.session_state.processing_complete = False
        if pdf_processing_workflow(uploaded_file) and generate_qa_pairs():
            st.session_state.processing_complete = True
            st.success("Processing completed successfully!")

    # Drawn after the pipeline so the sidebar shows the stage this run ended
    # in, not the one left over from the previous run.
    show_processing_status()

    if st.session_state.processing_complete:
        qa_pairs = st.session_state.qa_pairs

        # Show results
        st.header("Generated Q&A Pairs")
        for idx, pair in enumerate(qa_pairs[:10]):
            # .get(): the pairs are model output and may lack a field.
            with st.expander(f"Q{idx+1}: {pair.get('question', '')}"):
                st.write(f"**Answer 1:** {pair.get('answer_1', '')}")
                st.write(f"**Answer 2:** {pair.get('answer_2', '')}")

        # Data export
        st.header("Data Export")
        df = pd.DataFrame(qa_pairs)
        st.download_button(
            label="Download as CSV",
            data=df.to_csv(index=False).encode('utf-8'),
            file_name="synthetic_data.csv",
            mime="text/csv"
        )

    # Debug information
    if st.session_state.debug_mode:
        with st.expander("Debug Information"):
            st.write("### Session State")
            st.json(st.session_state)

            if st.session_state.get("document_data"):
                st.write("### Document Data Summary")
                st.write(f"Pages processed: {len(st.session_state.document_data)}")
                st.write(f"Total images extracted: {sum(len(p['images']) for p in st.session_state.document_data)}")

            if st.session_state.get("qa_pairs"):
                st.write("### Q&A Statistics")
                st.write(f"Total pairs generated: {len(st.session_state.qa_pairs)}")
                st.write("Sample Q&A pairs:")
                st.table(pd.DataFrame(st.session_state.qa_pairs[:3]))


if __name__ == "__main__":
    main()