Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,13 +1,13 @@
|
|
1 |
-
import ast
|
2 |
import json
|
3 |
import requests
|
4 |
import streamlit as st
|
5 |
import pdfplumber
|
6 |
import pandas as pd
|
7 |
import sqlalchemy
|
8 |
-
from typing import Any, Dict, List
|
|
|
9 |
|
10 |
-
# Provider clients
|
11 |
try:
|
12 |
from openai import OpenAI
|
13 |
except ImportError:
|
@@ -18,419 +18,261 @@ try:
|
|
18 |
except ImportError:
|
19 |
groq = None
|
20 |
|
21 |
-
|
22 |
-
|
23 |
-
DEFAULT_TEMPERATURE = 0.1
|
24 |
-
GROQ_MODEL = "mixtral-8x7b-32768"
|
25 |
-
|
26 |
-
|
27 |
-
class QADataGenerator:
|
28 |
-
"""
|
29 |
-
A Q&A Synthetic Generator that extracts and generates question-answer pairs
|
30 |
-
from various input sources using an LLM provider.
|
31 |
-
"""
|
32 |
-
def __init__(self) -> None:
|
33 |
-
self._setup_providers()
|
34 |
-
self._setup_input_handlers()
|
35 |
-
self._initialize_session_state()
|
36 |
-
# This prompt instructs the LLM to generate a configurable number of Q&A pairs.
|
37 |
-
# Note: Literal curly braces in the example are escaped with double braces.
|
38 |
-
self.custom_prompt_template = (
|
39 |
-
"You are an expert in extracting question and answer pairs from documents. "
|
40 |
-
"Generate {num_pairs} Q&A pairs from the following data, formatted as a JSON list of dictionaries. "
|
41 |
-
"Each dictionary must have keys 'question' and 'answer'. "
|
42 |
-
"The questions should be clear and concise, and the answers must be based solely on the provided data with no external information. "
|
43 |
-
"Do not hallucinate.\n\n"
|
44 |
-
"Example JSON Output for {num_pairs} pairs:\n"
|
45 |
-
"[{{'question': 'Example question 1', 'answer': 'Example answer 1'}}, "
|
46 |
-
"{{'question': 'Example question 2', 'answer': 'Example answer 2'}}, "
|
47 |
-
"..., "
|
48 |
-
"{{'question': 'Example question {num_pairs}', 'answer': 'Example answer {num_pairs}'}}]\n\n"
|
49 |
-
"Now, generate {num_pairs} Q&A pairs from this data:\n{data}"
|
50 |
-
)
|
51 |
|
52 |
-
|
53 |
-
""
|
54 |
-
|
55 |
-
"
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
"
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
def _setup_input_handlers(self) -> None:
|
74 |
-
"""Register handlers for different input data types."""
|
75 |
-
self.input_handlers: Dict[str, Any] = {
|
76 |
-
"text": self.handle_text,
|
77 |
-
"pdf": self.handle_pdf,
|
78 |
-
"csv": self.handle_csv,
|
79 |
-
"api": self.handle_api,
|
80 |
-
"db": self.handle_db,
|
81 |
}
|
82 |
-
|
83 |
-
|
84 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
85 |
defaults = {
|
86 |
-
"
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
|
|
|
|
|
|
91 |
},
|
92 |
-
"
|
93 |
-
"inputs": [], # List to store input sources
|
94 |
-
"qa_pairs": [], # Generated Q&A pairs output
|
95 |
-
"error_logs": [], # To store any error messages
|
96 |
-
"raw_response": "", # Store raw API response (if needed)
|
97 |
}
|
98 |
-
for key,
|
99 |
if key not in st.session_state:
|
100 |
-
st.session_state[key] =
|
101 |
-
|
102 |
-
def
|
103 |
-
"""
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
|
116 |
-
|
117 |
-
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
def
|
124 |
-
|
125 |
-
df = pd.read_csv(file)
|
126 |
-
# Convert the DataFrame to a JSON string
|
127 |
-
return {"data": df.to_json(orient="records"), "source": "csv"}
|
128 |
-
except Exception as e:
|
129 |
-
self.log_error(f"CSV Processing Error: {e}")
|
130 |
-
return {"data": "", "source": "csv"}
|
131 |
-
|
132 |
-
def handle_api(self, config: Dict[str, str]) -> Dict[str, Any]:
|
133 |
-
try:
|
134 |
-
response = requests.get(config["url"], headers=config.get("headers", {}), timeout=10)
|
135 |
-
response.raise_for_status()
|
136 |
-
return {"data": json.dumps(response.json()), "source": "api"}
|
137 |
-
except Exception as e:
|
138 |
-
self.log_error(f"API Processing Error: {e}")
|
139 |
-
return {"data": "", "source": "api"}
|
140 |
-
|
141 |
-
def handle_db(self, config: Dict[str, str]) -> Dict[str, Any]:
|
142 |
try:
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
|
|
|
|
|
|
|
|
148 |
except Exception as e:
|
149 |
-
self.
|
150 |
-
return
|
151 |
-
|
152 |
-
def
|
153 |
-
"""
|
154 |
-
|
155 |
-
|
156 |
-
aggregated_data += f"Source: {item.get('source', 'unknown')}\n"
|
157 |
-
aggregated_data += item.get("data", "") + "\n\n"
|
158 |
-
return aggregated_data.strip()
|
159 |
-
|
160 |
-
def build_prompt(self) -> str:
|
161 |
-
"""
|
162 |
-
Build the complete prompt using the custom template and aggregated inputs.
|
163 |
-
The number of Q&A pairs is inserted via the {num_pairs} placeholder.
|
164 |
-
"""
|
165 |
-
data = self.aggregate_inputs()
|
166 |
-
num_pairs = st.session_state.config.get("num_pairs", 3)
|
167 |
-
prompt = self.custom_prompt_template.format(data=data, num_pairs=num_pairs)
|
168 |
-
st.write("### Built Prompt")
|
169 |
-
st.write(prompt)
|
170 |
-
return prompt
|
171 |
-
|
172 |
-
def generate_qa_pairs(self) -> bool:
|
173 |
-
"""
|
174 |
-
Generate Q&A pairs by sending the built prompt to the selected LLM provider.
|
175 |
-
"""
|
176 |
-
api_key = st.session_state.api_key
|
177 |
-
if not api_key:
|
178 |
-
self.log_error("API key is missing!")
|
179 |
-
return False
|
180 |
-
|
181 |
-
provider_name = st.session_state.config["provider"]
|
182 |
-
provider_cfg = self.providers.get(provider_name)
|
183 |
-
if not provider_cfg:
|
184 |
-
self.log_error(f"Provider {provider_name} is not configured.")
|
185 |
-
return False
|
186 |
-
|
187 |
-
client_initializer = provider_cfg["client"]
|
188 |
-
client = client_initializer(api_key)
|
189 |
-
model = st.session_state.config["model"]
|
190 |
-
temperature = st.session_state.config["temperature"]
|
191 |
-
prompt = self.build_prompt()
|
192 |
|
193 |
-
|
|
|
|
|
194 |
try:
|
195 |
-
if
|
196 |
-
|
|
|
|
|
197 |
else:
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
|
203 |
-
|
204 |
-
qa_pairs = self._parse_response(response, provider_name)
|
205 |
-
st.write("### Parsed Q&A Pairs")
|
206 |
-
st.write(qa_pairs)
|
207 |
-
|
208 |
-
st.session_state.qa_pairs = qa_pairs
|
209 |
-
return True
|
210 |
except Exception as e:
|
211 |
-
self.
|
212 |
-
return False
|
213 |
-
|
214 |
-
def _standard_inference(self, client: Any, prompt: str, model: str, temperature: float) -> Any:
|
215 |
-
"""Inference method for providers using an OpenAI-compatible API."""
|
216 |
-
try:
|
217 |
-
st.write("Sending prompt via standard inference...")
|
218 |
-
result = client.chat.completions.create(
|
219 |
-
model=model,
|
220 |
-
messages=[{"role": "user", "content": prompt}],
|
221 |
-
temperature=temperature,
|
222 |
-
)
|
223 |
-
st.write("Standard inference result received.")
|
224 |
-
return result
|
225 |
-
except Exception as e:
|
226 |
-
self.log_error(f"Standard Inference Error: {e}")
|
227 |
return None
|
228 |
-
|
229 |
-
def
|
230 |
-
"""
|
231 |
-
|
232 |
-
|
|
|
233 |
response = requests.post(
|
234 |
-
|
235 |
headers=client["headers"],
|
236 |
json={"inputs": prompt},
|
237 |
-
timeout=30
|
238 |
)
|
239 |
response.raise_for_status()
|
240 |
-
st.write("HuggingFace API response received.")
|
241 |
return response.json()
|
242 |
-
|
243 |
-
|
244 |
-
|
245 |
-
|
246 |
-
|
247 |
-
|
248 |
-
|
249 |
-
|
250 |
-
|
251 |
-
|
252 |
-
|
253 |
-
|
254 |
-
|
255 |
-
|
256 |
-
raw_text = response[0]["generated_text"]
|
257 |
-
else:
|
258 |
-
self.log_error("Unexpected HuggingFace response format.")
|
259 |
-
return []
|
260 |
-
else:
|
261 |
-
if response and hasattr(response, "choices") and response.choices:
|
262 |
-
raw_text = response.choices[0].message.content
|
263 |
-
else:
|
264 |
-
self.log_error("Unexpected response format from provider.")
|
265 |
-
return []
|
266 |
-
|
267 |
-
# Try parsing as JSON first
|
268 |
-
try:
|
269 |
-
qa_list = json.loads(raw_text)
|
270 |
-
if isinstance(qa_list, list):
|
271 |
-
return qa_list
|
272 |
-
else:
|
273 |
-
self.log_error("Parsed output is not a list.")
|
274 |
-
return []
|
275 |
-
except json.JSONDecodeError:
|
276 |
-
st.write("Standard JSON parsing failed. Falling back to ast.literal_eval...")
|
277 |
try:
|
278 |
-
|
279 |
-
|
280 |
-
|
281 |
-
|
282 |
-
|
283 |
-
|
284 |
-
|
285 |
-
|
286 |
-
|
|
|
|
|
287 |
except Exception as e:
|
288 |
-
self.
|
289 |
-
return
|
290 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
291 |
|
292 |
-
#
|
293 |
|
294 |
-
|
295 |
-
|
296 |
-
|
297 |
-
st.
|
298 |
-
|
299 |
-
st.session_state.config["provider"] = provider
|
300 |
-
provider_cfg = generator.providers[provider]
|
301 |
-
|
302 |
-
model = st.selectbox("Select Model", provider_cfg["models"])
|
303 |
-
st.session_state.config["model"] = model
|
304 |
-
|
305 |
-
temperature = st.slider("Temperature", 0.0, 1.0, DEFAULT_TEMPERATURE)
|
306 |
-
st.session_state.config["temperature"] = temperature
|
307 |
|
308 |
-
|
309 |
-
|
310 |
-
st.session_state.config["num_pairs"] = num_pairs
|
311 |
-
|
312 |
-
api_key = st.text_input(f"{provider} API Key", type="password")
|
313 |
-
st.session_state.api_key = api_key
|
314 |
|
315 |
-
def
|
316 |
-
|
317 |
-
|
318 |
-
|
319 |
-
|
320 |
-
|
321 |
-
|
322 |
-
|
323 |
-
|
324 |
-
|
325 |
-
st.success("Text input added!")
|
326 |
-
else:
|
327 |
-
st.warning("Empty text input.")
|
328 |
-
|
329 |
-
with tabs[1]:
|
330 |
-
pdf_file = st.file_uploader("Upload PDF", type=["pdf"])
|
331 |
-
if pdf_file is not None:
|
332 |
-
st.session_state.inputs.append(generator.handle_pdf(pdf_file))
|
333 |
-
st.success("PDF input added!")
|
334 |
-
|
335 |
-
with tabs[2]:
|
336 |
-
csv_file = st.file_uploader("Upload CSV", type=["csv"])
|
337 |
-
if csv_file is not None:
|
338 |
-
st.session_state.inputs.append(generator.handle_csv(csv_file))
|
339 |
-
st.success("CSV input added!")
|
340 |
-
|
341 |
-
with tabs[3]:
|
342 |
-
api_url = st.text_input("API Endpoint URL")
|
343 |
-
api_headers = st.text_area("API Headers (JSON format, optional)", height=100)
|
344 |
-
if st.button("Add API Input", key="api_input"):
|
345 |
-
headers = {}
|
346 |
-
try:
|
347 |
-
if api_headers:
|
348 |
-
headers = json.loads(api_headers)
|
349 |
-
except Exception as e:
|
350 |
-
generator.log_error(f"Invalid JSON for API Headers: {e}")
|
351 |
-
st.session_state.inputs.append(generator.handle_api({"url": api_url, "headers": headers}))
|
352 |
-
st.success("API input added!")
|
353 |
-
|
354 |
-
with tabs[4]:
|
355 |
-
db_conn = st.text_input("Database Connection String")
|
356 |
-
db_query = st.text_area("Database Query", height=100)
|
357 |
-
if st.button("Add Database Input", key="db_input"):
|
358 |
-
st.session_state.inputs.append(generator.handle_db({"connection": db_conn, "query": db_query}))
|
359 |
-
st.success("Database input added!")
|
360 |
|
361 |
-
def
|
362 |
-
|
363 |
-
st.subheader("Q&A Pairs Output")
|
364 |
-
if st.session_state.qa_pairs:
|
365 |
-
st.write("### Generated Q&A Pairs (Table)")
|
366 |
try:
|
367 |
-
|
368 |
-
|
369 |
-
|
370 |
-
|
371 |
-
|
372 |
-
|
373 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
374 |
|
375 |
-
|
376 |
-
|
377 |
-
|
378 |
-
|
379 |
-
|
380 |
)
|
381 |
-
|
382 |
-
|
383 |
-
|
384 |
-
|
385 |
-
|
|
|
|
|
386 |
)
|
387 |
-
|
388 |
-
|
389 |
-
|
390 |
-
|
391 |
-
|
392 |
-
|
393 |
-
|
394 |
-
|
395 |
-
|
396 |
-
|
397 |
-
|
398 |
-
|
|
|
399 |
|
400 |
def main():
|
401 |
-
|
402 |
-
st.
|
403 |
-
|
404 |
-
""
|
405 |
-
|
406 |
-
from various input sources. Configure your provider in the sidebar, add input data, and click the button below to generate Q&A pairs.
|
407 |
-
"""
|
408 |
)
|
409 |
|
410 |
-
|
411 |
-
generator = QADataGenerator()
|
412 |
-
config_ui(generator)
|
413 |
|
414 |
-
st.
|
415 |
-
|
416 |
-
|
417 |
-
|
418 |
-
|
419 |
|
420 |
-
|
421 |
-
if st.button("Generate Q&A Pairs", key="generate_qa"):
|
422 |
-
with st.spinner("Generating Q&A pairs..."):
|
423 |
-
if generator.generate_qa_pairs():
|
424 |
-
st.success("Q&A pairs generated successfully!")
|
425 |
-
else:
|
426 |
-
st.error("Q&A generation failed. Check logs for details.")
|
427 |
|
428 |
-
|
429 |
-
output_ui(generator)
|
430 |
|
431 |
-
st.header("4. Logs & Debug Information")
|
432 |
-
logs_ui()
|
433 |
-
|
434 |
-
|
435 |
if __name__ == "__main__":
|
436 |
-
main()
|
|
|
|
|
1 |
import json
|
2 |
import requests
|
3 |
import streamlit as st
|
4 |
import pdfplumber
|
5 |
import pandas as pd
|
6 |
import sqlalchemy
|
7 |
+
from typing import Any, Dict, List, Optional
|
8 |
+
from functools import lru_cache
|
9 |
|
10 |
+
# Provider clients with import guards
|
11 |
try:
|
12 |
from openai import OpenAI
|
13 |
except ImportError:
|
|
|
18 |
except ImportError:
|
19 |
groq = None
|
20 |
|
21 |
+
class SyntheticDataGenerator:
    """World's Most Advanced Synthetic Data Generation System

    Multi-provider LLM front-end for Streamlit: keeps provider/API-key
    state in ``st.session_state``, routes generation requests to the
    selected provider (with failover), and normalizes heterogeneous
    inputs (text, PDF, CSV, API, database, web) into plain strings.
    """

    # Static provider registry. "requires_library" names the optional
    # client package guarded by the try/except imports at the top of the
    # file (None means plain HTTP via `requests` is enough).
    PROVIDER_CONFIG = {
        "Deepseek": {
            "base_url": "https://api.deepseek.com/v1",
            "models": ["deepseek-chat"],
            "requires_library": "openai"
        },
        "OpenAI": {
            "base_url": "https://api.openai.com/v1",
            "models": ["gpt-4-turbo", "gpt-3.5-turbo"],
            "requires_library": "openai"
        },
        "Groq": {
            "base_url": "https://api.groq.com/openai/v1",
            "models": ["mixtral-8x7b-32768", "llama2-70b-4096"],
            "requires_library": "groq"
        },
        "HuggingFace": {
            "base_url": "https://api-inference.huggingface.co/models/",
            "models": ["gpt2", "llama-2-13b-chat"],
            "requires_library": None
        }
    }

    def __init__(self):
        self._init_session_state()
        self._setup_input_handlers()
        self._setup_providers()

    def _init_session_state(self):
        """Initialize enterprise-grade session management.

        Seeds only the keys that are missing, so Streamlit script reruns
        do not clobber state accumulated in earlier interactions.
        """
        defaults = {
            "active_provider": "OpenAI",
            "api_keys": {},
            "input_sources": [],
            "generation_results": [],
            "system_metrics": {
                "api_calls": 0,
                "tokens_used": 0,
                "error_count": 0
            },
            "debug_mode": False
        }
        for key, val in defaults.items():
            if key not in st.session_state:
                st.session_state[key] = val

    def _setup_providers(self):
        """Configure available providers based on installed dependencies.

        BUGFIX: the probe previously used ``globals().get(lib.title())``,
        which looked up "Openai"/"Groq" — names that never exist (the
        guarded imports define ``OpenAI`` and ``groq``) — so availability
        was decided against the wrong symbols. Probe the real import
        sentinels instead (they are None when the import failed).
        """
        sentinels = {
            "openai": globals().get("OpenAI"),
            "groq": globals().get("groq"),
        }
        self.available_providers = []
        for provider, config in self.PROVIDER_CONFIG.items():
            lib = config["requires_library"]
            if lib is not None and sentinels.get(lib) is None:
                continue  # Skip providers with missing dependencies
            self.available_providers.append(provider)

    def _setup_input_handlers(self):
        """Register enterprise input processors.

        BUGFIX: this dict referenced ``_process_text``, ``_process_csv``,
        ``_process_api`` and ``_process_database`` which were never
        defined, so constructing the class raised AttributeError. The
        missing processors are now implemented below.
        """
        self.input_processors = {
            "text": self._process_text,
            "pdf": self._process_pdf,
            "csv": self._process_csv,
            "api": self._process_api,
            "database": self._process_database,
            "web": self._process_web
        }

    # --- Core Generation Engine ---
    # NOTE: the former @lru_cache(maxsize=100) decorator was removed
    # deliberately. Caching here was incorrect: generate() mutates
    # session metrics and depends on mutable st.session_state API keys,
    # so cache hits silently skipped real calls and returned stale data;
    # caching a bound method also pins `self` alive (ruff B019).
    def generate(self, provider: str, model: str, prompt: str) -> Dict[str, Any]:
        """Unified generation endpoint with failover support.

        Returns the provider's parsed JSON payload; on any error, logs it
        and falls back to the remaining available providers.
        """
        try:
            if provider not in self.available_providers:
                raise ValueError(f"Provider {provider} not available")

            client = self._get_client(provider)
            if not client:
                raise ConnectionError("Client initialization failed")

            return self._execute_generation(client, provider, model, prompt)

        except Exception as e:
            self._log_error(f"Generation Error: {str(e)}")
            return self._failover_generation(prompt)

    def _get_client(self, provider: str) -> Any:
        """Secure client initialization for the given provider.

        Raises ValueError when no API key is stored; returns None when
        client construction itself fails (after logging).
        """
        config = self.PROVIDER_CONFIG[provider]
        api_key = st.session_state.api_keys.get(provider, "")

        if not api_key:
            raise ValueError("API key required")

        try:
            if provider == "Groq":
                return groq.Groq(api_key=api_key)
            elif provider == "HuggingFace":
                # The HF inference API needs no SDK; carry auth headers only.
                return {"headers": {"Authorization": f"Bearer {api_key}"}}
            else:
                # Deepseek and OpenAI both speak the OpenAI-compatible API.
                return OpenAI(
                    base_url=config["base_url"],
                    api_key=api_key,
                    timeout=30
                )
        except Exception as e:
            self._log_error(f"Client Init Failed: {str(e)}")
            return None

    def _execute_generation(self, client, provider: str, model: str, prompt: str) -> Dict[str, Any]:
        """Execute a provider-specific generation call and record metrics."""
        st.session_state.system_metrics["api_calls"] += 1

        if provider == "HuggingFace":
            response = requests.post(
                self.PROVIDER_CONFIG[provider]["base_url"] + model,
                headers=client["headers"],
                json={"inputs": prompt},
                timeout=30
            )
            response.raise_for_status()
            return response.json()
        else:
            completion = client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": prompt}],
                temperature=0.1,
                max_tokens=2000
            )
            st.session_state.system_metrics["tokens_used"] += completion.usage.total_tokens
            # NOTE(review): assumes the model emits strict JSON — a
            # malformed reply raises here and triggers failover.
            return json.loads(completion.choices[0].message.content)

    def _failover_generation(self, prompt: str) -> Dict[str, Any]:
        """Retry the prompt against the remaining available providers.

        BUGFIX: this previously called ``self.generate(backup_provider, ...)``
        with a literal Ellipsis as the model (could never succeed) and
        re-entered generate(), whose except clause calls back into this
        method — mutual recursion. Now it uses each backup provider's
        first configured model and calls the execution layer directly.
        """
        for backup_provider in self.available_providers:
            if backup_provider != st.session_state.active_provider:
                try:
                    backup_model = self.PROVIDER_CONFIG[backup_provider]["models"][0]
                    client = self._get_client(backup_provider)
                    if not client:
                        continue
                    return self._execute_generation(client, backup_provider, backup_model, prompt)
                except Exception:
                    continue
        raise RuntimeError("All generation providers unavailable")

    # --- Input Processors ---
    # Each processor returns extracted text, or "" after logging on failure.
    def _process_text(self, text: str) -> str:
        """Pass raw text through unchanged (empty string for falsy input)."""
        return text or ""

    def _process_pdf(self, file) -> str:
        """Extract text from an uploaded PDF via pdfplumber."""
        try:
            with pdfplumber.open(file) as pdf:
                # extract_text() may return None for image-only pages.
                return "\n".join(page.extract_text() or "" for page in pdf.pages)
        except Exception as e:
            self._log_error(f"PDF Processing Error: {str(e)}")
            return ""

    def _process_csv(self, file) -> str:
        """Load a CSV upload and serialize its rows as a JSON records string."""
        try:
            return pd.read_csv(file).to_json(orient="records")
        except Exception as e:
            self._log_error(f"CSV Processing Error: {str(e)}")
            return ""

    def _process_api(self, url: str) -> str:
        """Fetch JSON from an external API endpoint and return it as text."""
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            return json.dumps(response.json())
        except Exception as e:
            self._log_error(f"API Processing Error: {str(e)}")
            return ""

    def _process_database(self, config: Dict[str, str]) -> str:
        """Run config["query"] against config["connection"] via SQLAlchemy."""
        try:
            engine = sqlalchemy.create_engine(config["connection"])
            with engine.connect() as conn:
                return pd.read_sql(config["query"], conn).to_json(orient="records")
        except Exception as e:
            self._log_error(f"Database Processing Error: {str(e)}")
            return ""

    def _process_web(self, url: str) -> str:
        """Web content extraction with a polite identifying User-Agent."""
        try:
            response = requests.get(url, headers={
                "User-Agent": "Mozilla/5.0 (compatible; SyntheticBot/1.0)"
            }, timeout=10)
            return response.text
        except Exception as e:
            self._log_error(f"Web Extraction Error: {str(e)}")
            return ""

    # --- Enterprise Features ---
    def _log_error(self, message: str) -> None:
        """Centralized error logging with telemetry counters."""
        st.session_state.system_metrics["error_count"] += 1
        st.session_state.error_logs = st.session_state.get("error_logs", []) + [message]
        if st.session_state.debug_mode:
            st.error(f"[DEBUG] {message}")

    def health_check(self) -> Dict[str, Any]:
        """Comprehensive system diagnostics: providers, connectivity, metrics."""
        return {
            "providers_available": self.available_providers,
            "api_connectivity": {
                provider: self._test_provider_connectivity(provider)
                for provider in self.available_providers
            },
            "system_metrics": st.session_state.system_metrics
        }

    def _test_provider_connectivity(self, provider: str) -> bool:
        """Cheap provider-specific connectivity probe; never raises."""
        try:
            client = self._get_client(provider)
            if provider == "HuggingFace":
                response = requests.get(
                    self.PROVIDER_CONFIG[provider]["base_url"],
                    headers=client["headers"],
                    timeout=5
                )
                return response.status_code == 200
            else:
                # Listing models is the lightest authenticated call.
                client.models.list()
                return True
        except Exception:
            return False
|
222 |
+
|
223 |
+
# --- Enterprise UI Components ---
|
224 |
+
def provider_config_ui(gen: SyntheticDataGenerator):
    """Advanced provider configuration interface"""
    with st.sidebar:
        st.header("⚙️ AI Engine Configuration")

        # Offer only providers whose client libraries imported successfully.
        chosen = st.selectbox(
            "AI Provider",
            gen.available_providers,
            help="Available providers based on system configuration"
        )

        # Persist the entered key for the chosen provider across reruns.
        entered_key = st.text_input(
            f"{chosen} API Key",
            type="password",
            value=st.session_state.api_keys.get(chosen, ""),
            help=f"Obtain API key from {chosen} portal"
        )
        st.session_state.api_keys[chosen] = entered_key

        # Model choice is scoped to the selected provider's catalogue.
        _selected_model = st.selectbox(
            "Model",
            gen.PROVIDER_CONFIG[chosen]["models"],
            help="Select model version based on your API plan"
        )

        # On-demand diagnostics rendered inline in the sidebar.
        if st.button("Run Health Check"):
            st.json(gen.health_check())
|
256 |
|
257 |
def main():
    """Enterprise-grade user interface"""
    # Wide layout with branded tab title/icon.
    st.set_page_config(page_title="Synthetic Data Factory Pro", page_icon="🏭", layout="wide")

    factory = SyntheticDataGenerator()

    st.title("🏭 Synthetic Data Factory Pro")
    st.markdown("""
    **World's Most Advanced Synthetic Data Generation Platform**
    *Multi-provider AI Engine | Enterprise Input Processors | Real-time Monitoring*
    """)

    provider_config_ui(factory)

    # Input management and generation UI components...
|
|
|
276 |
|
|
|
|
|
|
|
|
|
277 |
# Script entry point: render the Streamlit app when executed directly.
if __name__ == "__main__":
    main()
|