Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -17,280 +17,215 @@ import traceback
|
|
17 |
# Configuration
|
18 |
MAX_THREADS = 4
|
19 |
SUPPORTED_MODELS = {
|
20 |
-
"Deepseek":
|
21 |
-
|
22 |
-
|
|
|
23 |
}
|
24 |
|
|
|
|
|
|
|
|
|
|
|
25 |
def initialize_session_state():
|
26 |
-
"""Initialize all session state variables"""
|
27 |
-
|
28 |
'document_data': [],
|
29 |
'qa_pairs': [],
|
30 |
'processing_complete': False,
|
31 |
'current_stage': 'idle',
|
32 |
'api_keys': {},
|
33 |
'model_choice': "Deepseek",
|
34 |
-
'temperature': 0.3
|
|
|
35 |
}
|
36 |
|
37 |
-
for key, value in
|
38 |
if key not in st.session_state:
|
39 |
st.session_state[key] = value
|
40 |
|
41 |
-
def
|
42 |
-
"""
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
st.error("Please enter a valid API key")
|
55 |
|
56 |
def process_image(img_data, page_num, img_idx):
|
57 |
-
"""
|
58 |
try:
|
59 |
img = img_data["stream"]
|
60 |
width = int(img_data["width"])
|
61 |
height = int(img_data["height"])
|
62 |
|
63 |
-
|
64 |
-
|
65 |
-
mode = "RGB"
|
66 |
-
if "/DeviceCMYK" in str(color_space):
|
67 |
-
mode = "CMYK"
|
68 |
-
elif "/DeviceGray" in str(color_space):
|
69 |
-
mode = "L"
|
70 |
-
|
71 |
# Convert image to RGB
|
72 |
-
|
73 |
-
|
74 |
-
|
|
|
75 |
|
76 |
-
return image
|
77 |
-
except Exception as e:
|
78 |
-
st.error(f"Image processing error (Page {page_num}, Image {img_idx}): {str(e)[:100]}")
|
79 |
-
return None
|
80 |
-
|
81 |
-
def process_page(page_data):
|
82 |
-
"""Thread-safe page processing"""
|
83 |
-
page_num, page = page_data
|
84 |
-
try:
|
85 |
-
text = page.extract_text() or ""
|
86 |
-
images = []
|
87 |
-
|
88 |
-
for idx, img in enumerate(page.images):
|
89 |
-
processed_image = process_image(img, page_num, idx)
|
90 |
-
if processed_image:
|
91 |
-
images.append(processed_image)
|
92 |
-
|
93 |
-
return {"page": page_num, "text": text.strip(), "images": images}
|
94 |
except Exception as e:
|
95 |
-
st.error(f"Page {page_num}
|
96 |
return None
|
97 |
|
98 |
-
def
|
99 |
-
"""
|
100 |
-
st.session_state.
|
101 |
|
102 |
-
|
103 |
with pdfplumber.open(uploaded_file) as pdf:
|
104 |
-
|
105 |
-
|
106 |
-
)
|
107 |
-
|
108 |
-
while not future.done():
|
109 |
-
time.sleep(0.1)
|
110 |
-
st.rerun()
|
111 |
|
112 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
113 |
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
def hybrid_text_extraction(entry):
|
120 |
-
"""Multimodal text extraction with fallback"""
|
121 |
-
text_content = entry["text"]
|
122 |
-
|
123 |
-
if not text_content and entry["images"]:
|
124 |
-
ocr_results = []
|
125 |
-
for img in entry["images"]:
|
126 |
-
try:
|
127 |
-
ocr_results.append(pytesseract.image_to_string(img))
|
128 |
-
except Exception as e:
|
129 |
-
st.warning(f"OCR failed: {str(e)[:100]}")
|
130 |
-
text_content = " ".join(ocr_results).strip()
|
131 |
-
|
132 |
-
return text_content
|
133 |
|
134 |
-
def
|
135 |
-
"""
|
136 |
-
|
137 |
-
|
138 |
-
api_key=st.secrets.get("DEEPSEEK_API_KEY")
|
139 |
-
)
|
140 |
|
141 |
-
|
142 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
143 |
response = client.chat.completions.create(
|
144 |
-
model=SUPPORTED_MODELS[model],
|
145 |
-
messages=
|
|
|
|
|
|
|
146 |
max_tokens=2048,
|
147 |
response_format={"type": "json_object"},
|
148 |
temperature=st.session_state.temperature
|
149 |
)
|
150 |
-
return json.loads(response.choices[0].message.content)
|
151 |
-
except Exception as e:
|
152 |
-
if attempt == 2:
|
153 |
-
raise
|
154 |
-
time.sleep(2 ** attempt)
|
155 |
-
|
156 |
-
def qa_generation_workflow():
|
157 |
-
"""Enterprise Q&A generation pipeline"""
|
158 |
-
with st.status("🚀 AI Processing Pipeline", expanded=True) as status:
|
159 |
-
try:
|
160 |
-
st.write("Initializing neural processors...")
|
161 |
-
total_pages = len(st.session_state.document_data)
|
162 |
-
qa_pairs = []
|
163 |
-
|
164 |
-
for idx, entry in enumerate(st.session_state.document_data):
|
165 |
-
status.write(f"Processing page {idx+1}/{total_pages}")
|
166 |
-
text_content = hybrid_text_extraction(entry)
|
167 |
-
|
168 |
-
prompt = f"""Generate 3 sophisticated Q&A pairs from:
|
169 |
-
Page {entry['page']} Content:
|
170 |
-
{text_content}
|
171 |
-
|
172 |
-
Return JSON format: {{"qa_pairs": [{{"question": "...", "answer_1": "...", "answer_2": "..."}}]}}"""
|
173 |
-
|
174 |
-
response = generate_with_retry(
|
175 |
-
st.session_state.model_choice,
|
176 |
-
[{"role": "user", "content": prompt}]
|
177 |
-
)
|
178 |
-
qa_pairs.extend(response.get("qa_pairs", []))
|
179 |
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
185 |
-
|
186 |
-
def evaluation_interface():
|
187 |
-
"""Interactive quality control center"""
|
188 |
-
st.header("🧪 Quality Control Hub")
|
189 |
-
|
190 |
-
with st.expander("Automated AI Evaluation", expanded=True):
|
191 |
-
if st.button("Run Batch Validation"):
|
192 |
-
with st.spinner("Validating responses..."):
|
193 |
-
time.sleep(2) # Simulated validation
|
194 |
-
st.success("Quality check passed: 98% accuracy")
|
195 |
-
|
196 |
-
with st.expander("Human-in-the-Loop Review"):
|
197 |
-
sample_size = min(5, len(st.session_state.qa_pairs))
|
198 |
-
for idx in range(sample_size):
|
199 |
-
pair = st.session_state.qa_pairs[idx]
|
200 |
-
with st.container(border=True):
|
201 |
-
col1, col2 = st.columns([1, 3])
|
202 |
-
with col1:
|
203 |
-
st.metric("Page", pair["page"])
|
204 |
-
with col2:
|
205 |
-
st.write(f"**Question:** {pair['question']}")
|
206 |
-
|
207 |
-
tab1, tab2 = st.tabs(["Answer 1", "Answer 2"])
|
208 |
-
with tab1:
|
209 |
-
st.write(pair["answer_1"])
|
210 |
-
with tab2:
|
211 |
-
st.write(pair["answer_2"])
|
212 |
|
213 |
-
|
214 |
-
|
215 |
-
|
216 |
-
|
217 |
-
|
218 |
-
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
|
223 |
-
col1, col2, col3 = st.columns(3)
|
224 |
-
with col1:
|
225 |
-
export_format = st.selectbox("Format", ["JSON", "CSV", "Parquet"])
|
226 |
-
with col2:
|
227 |
-
compression = st.selectbox("Compression", ["None", "gzip", "zip"])
|
228 |
-
with col3:
|
229 |
-
include_metadata = st.checkbox("Include Metadata", True)
|
230 |
-
|
231 |
-
if st.button("Generate Export Package"):
|
232 |
-
with st.spinner("Packaging data..."):
|
233 |
-
df = pd.DataFrame(st.session_state.qa_pairs)
|
234 |
-
buffer = BytesIO()
|
235 |
-
|
236 |
-
if export_format == "JSON":
|
237 |
-
df.to_json(buffer, orient="records", indent=2)
|
238 |
-
mime = "application/json"
|
239 |
-
elif export_format == "CSV":
|
240 |
-
df.to_csv(buffer, index=False)
|
241 |
-
mime = "text/csv"
|
242 |
-
else:
|
243 |
-
df.to_parquet(buffer, compression=compression if compression != "None" else None)
|
244 |
-
mime = "application/octet-stream"
|
245 |
-
|
246 |
-
st.download_button(
|
247 |
-
label="Download Dataset",
|
248 |
-
data=buffer.getvalue(),
|
249 |
-
file_name=f"synthetic_data_{int(time.time())}.{export_format.lower()}",
|
250 |
-
mime=mime
|
251 |
-
)
|
252 |
-
|
253 |
-
def main_interface():
|
254 |
-
"""Core application interface"""
|
255 |
-
st.title("🏭 Synthetic Data Factory")
|
256 |
-
st.write("Industrial-scale synthetic data generation powered by cutting-edge AI")
|
257 |
-
|
258 |
-
# Processing pipeline
|
259 |
-
if uploaded_file := st.sidebar.file_uploader("Upload PDF Document", type=["pdf"]):
|
260 |
-
if st.sidebar.button("Start Generation"):
|
261 |
-
st.session_state.processing_complete = False
|
262 |
-
advanced_pdf_processor(uploaded_file)
|
263 |
-
qa_generation_workflow()
|
264 |
-
st.session_state.processing_complete = True
|
265 |
-
|
266 |
-
# Display results
|
267 |
-
if st.session_state.processing_complete:
|
268 |
-
evaluation_interface()
|
269 |
-
data_export_module()
|
270 |
|
271 |
def main():
|
272 |
-
"""Main application
|
273 |
st.set_page_config(
|
274 |
-
page_title="Synthetic Data
|
275 |
-
page_icon="
|
276 |
layout="wide"
|
277 |
)
|
278 |
|
279 |
initialize_session_state()
|
280 |
-
secure_api_management()
|
281 |
|
|
|
282 |
with st.sidebar:
|
283 |
-
st.header("⚙️
|
284 |
st.session_state.model_choice = st.selectbox(
|
285 |
-
"AI Model",
|
286 |
-
list(SUPPORTED_MODELS.keys())
|
287 |
)
|
288 |
st.session_state.temperature = st.slider(
|
289 |
-
"Creativity Level",
|
290 |
-
0.0, 1.0, 0.3
|
291 |
)
|
|
|
|
|
292 |
|
293 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
294 |
|
295 |
if __name__ == "__main__":
|
296 |
main()
|
|
|
17 |
# Configuration
|
18 |
MAX_THREADS = 4
|
19 |
SUPPORTED_MODELS = {
|
20 |
+
"Deepseek": {
|
21 |
+
"model": "deepseek-chat",
|
22 |
+
"base_url": "https://api.deepseek.com/v1"
|
23 |
+
}
|
24 |
}
|
25 |
|
26 |
+
def debug_log(message):
|
27 |
+
"""Enhanced logging system"""
|
28 |
+
if st.session_state.get("debug_mode"):
|
29 |
+
st.toast(f"DEBUG: {message}", icon="🐛")
|
30 |
+
|
31 |
def initialize_session_state():
|
32 |
+
"""Initialize all session state variables with validation"""
|
33 |
+
required_keys = {
|
34 |
'document_data': [],
|
35 |
'qa_pairs': [],
|
36 |
'processing_complete': False,
|
37 |
'current_stage': 'idle',
|
38 |
'api_keys': {},
|
39 |
'model_choice': "Deepseek",
|
40 |
+
'temperature': 0.3,
|
41 |
+
'debug_mode': True
|
42 |
}
|
43 |
|
44 |
+
for key, value in required_keys.items():
|
45 |
if key not in st.session_state:
|
46 |
st.session_state[key] = value
|
47 |
|
48 |
+
def show_processing_status():
|
49 |
+
"""Visual feedback system"""
|
50 |
+
status_messages = {
|
51 |
+
'idle': "🟢 Ready to process",
|
52 |
+
'extracting': "🔍 Extracting document content...",
|
53 |
+
'generating': "🧠 Generating Q&A pairs...",
|
54 |
+
'evaluating': "📊 Evaluating results...",
|
55 |
+
'error': "❌ Processing failed"
|
56 |
+
}
|
57 |
+
|
58 |
+
status = st.session_state.current_stage
|
59 |
+
debug_log(f"Status update: {status}")
|
60 |
+
st.sidebar.markdown(f"**System Status:** {status_messages.get(status, 'Unknown')}")
|
|
|
61 |
|
62 |
def process_image(img_data, page_num, img_idx):
|
63 |
+
"""Robust image processing with validation"""
|
64 |
try:
|
65 |
img = img_data["stream"]
|
66 |
width = int(img_data["width"])
|
67 |
height = int(img_data["height"])
|
68 |
|
69 |
+
debug_log(f"Processing image {img_idx} on page {page_num}")
|
70 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
71 |
# Convert image to RGB
|
72 |
+
try:
|
73 |
+
return Image.frombytes("RGB", (width, height), img.get_data())
|
74 |
+
except:
|
75 |
+
return Image.frombytes("L", (width, height), img.get_data()).convert("RGB")
|
76 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
77 |
except Exception as e:
|
78 |
+
st.error(f"Image processing failed (Page {page_num}, Image {img_idx}): {str(e)}")
|
79 |
return None
|
80 |
|
81 |
+
def pdf_processing_workflow(uploaded_file):
|
82 |
+
"""PDF processing with real-time feedback"""
|
83 |
+
st.session_state.current_stage = 'extracting'
|
84 |
|
85 |
+
try:
|
86 |
with pdfplumber.open(uploaded_file) as pdf:
|
87 |
+
total_pages = len(pdf.pages)
|
88 |
+
progress_bar = st.progress(0)
|
89 |
+
status_text = st.empty()
|
|
|
|
|
|
|
|
|
90 |
|
91 |
+
for page_num, page in enumerate(pdf.pages, 1):
|
92 |
+
status_text.text(f"Processing page {page_num}/{total_pages}")
|
93 |
+
progress_bar.progress(page_num/total_pages)
|
94 |
+
|
95 |
+
try:
|
96 |
+
text = page.extract_text() or ""
|
97 |
+
images = [process_image(img, page_num, idx)
|
98 |
+
for idx, img in enumerate(page.images)]
|
99 |
+
|
100 |
+
st.session_state.document_data.append({
|
101 |
+
"page": page_num,
|
102 |
+
"text": text.strip(),
|
103 |
+
"images": [img for img in images if img is not None]
|
104 |
+
})
|
105 |
+
except Exception as e:
|
106 |
+
st.error(f"Page {page_num} error: {str(e)}")
|
107 |
+
|
108 |
+
time.sleep(0.1) # Simulate processing
|
109 |
+
|
110 |
+
progress_bar.empty()
|
111 |
+
status_text.success("Document processing complete!")
|
112 |
+
return True
|
113 |
|
114 |
+
except Exception as e:
|
115 |
+
st.session_state.current_stage = 'error'
|
116 |
+
st.error(f"PDF processing failed: {str(e)}")
|
117 |
+
debug_log(traceback.format_exc())
|
118 |
+
return False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
119 |
|
120 |
+
def generate_qa_pairs():
|
121 |
+
"""Q&A generation with validation"""
|
122 |
+
st.session_state.current_stage = 'generating'
|
123 |
+
qa_pairs = []
|
|
|
|
|
124 |
|
125 |
+
try:
|
126 |
+
client = openai.OpenAI(
|
127 |
+
base_url=SUPPORTED_MODELS[st.session_state.model_choice]["base_url"],
|
128 |
+
api_key=st.secrets["DEEPSEEK_API_KEY"]
|
129 |
+
)
|
130 |
+
|
131 |
+
for idx, entry in enumerate(st.session_state.document_data):
|
132 |
+
text_content = entry["text"] or " ".join([
|
133 |
+
pytesseract.image_to_string(img) for img in entry["images"]
|
134 |
+
])
|
135 |
+
|
136 |
response = client.chat.completions.create(
|
137 |
+
model=SUPPORTED_MODELS[st.session_state.model_choice]["model"],
|
138 |
+
messages=[{
|
139 |
+
"role": "user",
|
140 |
+
"content": f"Generate 3 Q&A pairs from:\n{text_content}\nReturn JSON format: {{'qa_pairs': [{{'question': '...', 'answer_1': '...', 'answer_2': '...'}}]}}"
|
141 |
+
}],
|
142 |
max_tokens=2048,
|
143 |
response_format={"type": "json_object"},
|
144 |
temperature=st.session_state.temperature
|
145 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
146 |
|
147 |
+
try:
|
148 |
+
result = json.loads(response.choices[0].message.content)
|
149 |
+
qa_pairs.extend(result.get("qa_pairs", []))
|
150 |
+
debug_log(f"Generated {len(result.get('qa_pairs', []))} pairs for page {entry['page']}")
|
151 |
+
except json.JSONDecodeError:
|
152 |
+
st.error(f"Invalid response format from API for page {entry['page']}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
153 |
|
154 |
+
st.session_state.qa_pairs = qa_pairs
|
155 |
+
st.session_state.current_stage = 'evaluating'
|
156 |
+
return True
|
157 |
+
|
158 |
+
except Exception as e:
|
159 |
+
st.session_state.current_stage = 'error'
|
160 |
+
st.error(f"Q&A generation failed: {str(e)}")
|
161 |
+
debug_log(traceback.format_exc())
|
162 |
+
return False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
163 |
|
164 |
def main():
|
165 |
+
"""Main application interface"""
|
166 |
st.set_page_config(
|
167 |
+
page_title="Synthetic Data Generator",
|
168 |
+
page_icon="🧪",
|
169 |
layout="wide"
|
170 |
)
|
171 |
|
172 |
initialize_session_state()
|
|
|
173 |
|
174 |
+
# Debug panel
|
175 |
with st.sidebar:
|
176 |
+
st.header("⚙️ Configuration")
|
177 |
st.session_state.model_choice = st.selectbox(
|
178 |
+
"AI Model", list(SUPPORTED_MODELS.keys())
|
|
|
179 |
)
|
180 |
st.session_state.temperature = st.slider(
|
181 |
+
"Creativity Level", 0.0, 1.0, 0.3
|
|
|
182 |
)
|
183 |
+
st.session_state.debug_mode = st.checkbox("Debug Mode", True)
|
184 |
+
show_processing_status()
|
185 |
|
186 |
+
st.title("🧪 Synthetic Data Generator")
|
187 |
+
|
188 |
+
# File upload section
|
189 |
+
uploaded_file = st.file_uploader("Upload PDF Document", type=["pdf"])
|
190 |
+
|
191 |
+
if uploaded_file and st.button("Start Processing"):
|
192 |
+
if pdf_processing_workflow(uploaded_file):
|
193 |
+
if generate_qa_pairs():
|
194 |
+
st.success("Processing completed successfully!")
|
195 |
+
|
196 |
+
# Show results
|
197 |
+
st.header("Generated Q&A Pairs")
|
198 |
+
for idx, pair in enumerate(st.session_state.qa_pairs[:10]):
|
199 |
+
with st.expander(f"Q{idx+1}: {pair['question']}"):
|
200 |
+
st.write(f"**Answer 1:** {pair['answer_1']}")
|
201 |
+
st.write(f"**Answer 2:** {pair['answer_2']}")
|
202 |
+
|
203 |
+
# Data export
|
204 |
+
st.header("Data Export")
|
205 |
+
df = pd.DataFrame(st.session_state.qa_pairs)
|
206 |
+
st.download_button(
|
207 |
+
label="Download as CSV",
|
208 |
+
data=df.to_csv(index=False).encode('utf-8'),
|
209 |
+
file_name="synthetic_data.csv",
|
210 |
+
mime="text/csv"
|
211 |
+
)
|
212 |
+
|
213 |
+
# Debug information
|
214 |
+
if st.session_state.debug_mode:
|
215 |
+
with st.expander("Debug Information"):
|
216 |
+
st.write("### Session State")
|
217 |
+
st.json(st.session_state)
|
218 |
+
|
219 |
+
if st.session_state.get("document_data"):
|
220 |
+
st.write("### Document Data Summary")
|
221 |
+
st.write(f"Pages processed: {len(st.session_state.document_data)}")
|
222 |
+
st.write(f"Total images extracted: {sum(len(p['images']) for p in st.session_state.document_data)}")
|
223 |
+
|
224 |
+
if st.session_state.get("qa_pairs"):
|
225 |
+
st.write("### Q&A Statistics")
|
226 |
+
st.write(f"Total pairs generated: {len(st.session_state.qa_pairs)}")
|
227 |
+
st.write("Sample Q&A pairs:")
|
228 |
+
st.table(pd.DataFrame(st.session_state.qa_pairs[:3]))
|
229 |
|
230 |
if __name__ == "__main__":
|
231 |
main()
|