Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,3 +1,4 @@
|
|
|
|
1 |
import requests
|
2 |
import streamlit as st
|
3 |
import pdfplumber
|
@@ -5,13 +6,6 @@ import pandas as pd
|
|
5 |
import sqlalchemy
|
6 |
from typing import Any, Dict, List, Optional, Union
|
7 |
from functools import lru_cache
|
8 |
-
import json # Explicit import
|
9 |
-
import os
|
10 |
-
|
11 |
-
# --- Constants ---
|
12 |
-
DEFAULT_TEMPERATURE = 0.1
|
13 |
-
DEFAULT_MAX_TOKENS = 2000
|
14 |
-
API_TIMEOUT = 30
|
15 |
|
16 |
# Provider clients with import guards
|
17 |
try:
|
@@ -24,58 +18,41 @@ try:
|
|
24 |
except ImportError:
|
25 |
groq = None
|
26 |
|
27 |
-
try:
|
28 |
-
import google.generativeai as genai
|
29 |
-
from google.generativeai import GenerativeModel, configure, Part
|
30 |
-
except ImportError:
|
31 |
-
GenerativeModel = None
|
32 |
-
configure = None
|
33 |
-
genai = None
|
34 |
-
Part = None
|
35 |
-
|
36 |
|
37 |
class SyntheticDataGenerator:
|
38 |
"""World's Most Advanced Synthetic Data Generation System"""
|
39 |
|
40 |
-
PROVIDER_CONFIG = {
|
41 |
"Deepseek": {
|
42 |
"base_url": "https://api.deepseek.com/v1",
|
43 |
"models": ["deepseek-chat"],
|
44 |
"requires_library": "openai",
|
45 |
-
"supports_json_output": True, # Indicate that the provider reliably returns JSON
|
46 |
},
|
47 |
"OpenAI": {
|
48 |
"base_url": "https://api.openai.com/v1",
|
49 |
"models": ["gpt-4-turbo", "gpt-3.5-turbo"],
|
50 |
"requires_library": "openai",
|
51 |
-
"supports_json_output": True,
|
52 |
},
|
53 |
"Groq": {
|
54 |
"base_url": "https://api.groq.com/openai/v1",
|
55 |
"models": ["mixtral-8x7b-32768", "llama2-70b-4096"],
|
56 |
"requires_library": "groq",
|
57 |
-
"supports_json_output": True,
|
58 |
},
|
59 |
"HuggingFace": {
|
60 |
"base_url": "https://api-inference.huggingface.co/models/",
|
61 |
"models": ["gpt2", "llama-2-13b-chat"],
|
62 |
"requires_library": None,
|
63 |
-
"supports_json_output": False, # More likely to return text
|
64 |
-
},
|
65 |
-
"Google": {
|
66 |
-
"models": ["gemini-1.5-flash-latest", "gemini-1.5-pro-latest", "gemini-pro", "gemini-pro-vision"],
|
67 |
-
"requires_library": "google.generativeai",
|
68 |
-
"supports_json_output": True
|
69 |
},
|
70 |
}
|
71 |
|
72 |
-
def __init__(self):
|
|
|
73 |
self._init_session_state()
|
74 |
self._setup_input_handlers()
|
75 |
self._setup_providers()
|
76 |
|
77 |
-
def _init_session_state(self):
|
78 |
-
"""Initialize enterprise-grade session management"""
|
79 |
defaults = {
|
80 |
"active_provider": "OpenAI",
|
81 |
"api_keys": {},
|
@@ -86,52 +63,24 @@ class SyntheticDataGenerator:
|
|
86 |
"tokens_used": 0,
|
87 |
"error_count": 0,
|
88 |
},
|
|
|
89 |
"debug_mode": False,
|
90 |
-
"temperature": DEFAULT_TEMPERATURE, # Add temperature control
|
91 |
-
"max_tokens": DEFAULT_MAX_TOKENS, # Add max token control
|
92 |
-
"use_streaming": False, # Control Streaming behavior
|
93 |
-
"prompt_template": None, # Support prompt templates
|
94 |
-
"api_call_timeout": API_TIMEOUT, # API call timeout
|
95 |
-
"image_parts": [], # Store image parts for multimodal generation
|
96 |
-
"top_p": 0.95, # Default top_p for Google
|
97 |
-
"top_k": 40, # Default top_k for Google
|
98 |
-
"safety_settings": self._get_default_safety_settings(), #Default Safety Settings
|
99 |
}
|
100 |
for key, val in defaults.items():
|
101 |
if key not in st.session_state:
|
102 |
st.session_state[key] = val
|
103 |
|
104 |
-
def
|
105 |
-
|
106 |
-
|
107 |
-
{
|
108 |
-
"category": "HARM_CATEGORY_HARASSMENT",
|
109 |
-
"threshold": "BLOCK_MEDIUM_AND_ABOVE"
|
110 |
-
},
|
111 |
-
{
|
112 |
-
"category": "HARM_CATEGORY_HATE_SPEECH",
|
113 |
-
"threshold": "BLOCK_MEDIUM_AND_ABOVE"
|
114 |
-
},
|
115 |
-
{
|
116 |
-
"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
|
117 |
-
"threshold": "BLOCK_MEDIUM_AND_ABOVE"
|
118 |
-
},
|
119 |
-
{
|
120 |
-
"category": "HARM_CATEGORY_DANGEROUS_CONTENT",
|
121 |
-
"threshold": "BLOCK_MEDIUM_AND_ABOVE"
|
122 |
-
},
|
123 |
-
]
|
124 |
-
|
125 |
-
def _setup_providers(self):
|
126 |
-
"""Configure available providers with health checks"""
|
127 |
-
self.available_providers = []
|
128 |
for provider, config in self.PROVIDER_CONFIG.items():
|
129 |
-
|
|
|
130 |
continue # Skip providers with missing dependencies
|
131 |
self.available_providers.append(provider)
|
132 |
|
133 |
-
def _setup_input_handlers(self):
|
134 |
-
"""Register enterprise input processors"""
|
135 |
self.input_processors = {
|
136 |
"text": self._process_text,
|
137 |
"pdf": self._process_pdf,
|
@@ -139,149 +88,117 @@ class SyntheticDataGenerator:
|
|
139 |
"api": self._process_api,
|
140 |
"database": self._process_database,
|
141 |
"web": self._process_web,
|
142 |
-
"prompt_template": self._process_prompt_template,
|
143 |
-
"image": self._process_image,
|
144 |
}
|
145 |
|
146 |
-
# --- Core Generation Engine ---
|
147 |
@lru_cache(maxsize=100)
|
148 |
-
def generate(self, provider: str, model: str, prompt:
|
149 |
-
"""
|
|
|
|
|
|
|
150 |
try:
|
151 |
if provider not in self.available_providers:
|
152 |
-
raise ValueError(f"Provider {provider} not available")
|
153 |
|
154 |
client = self._get_client(provider)
|
155 |
if not client:
|
156 |
-
raise ConnectionError("Client initialization failed")
|
157 |
|
158 |
return self._execute_generation(client, provider, model, prompt)
|
159 |
|
160 |
except Exception as e:
|
161 |
-
self._log_error(f"Generation Error: {str(e)}")
|
162 |
-
return self._failover_generation(prompt)
|
163 |
|
164 |
def _get_client(self, provider: str) -> Any:
|
165 |
-
"""
|
|
|
|
|
|
|
166 |
config = self.PROVIDER_CONFIG[provider]
|
167 |
-
api_key = st.session_state
|
168 |
|
169 |
-
if not api_key
|
170 |
-
raise ValueError(f"API key required for provider
|
171 |
|
172 |
try:
|
173 |
if provider == "Groq":
|
174 |
return groq.Groq(api_key=api_key)
|
175 |
elif provider == "HuggingFace":
|
176 |
return {"headers": {"Authorization": f"Bearer {api_key}"}}
|
177 |
-
elif provider == "Google":
|
178 |
-
if not st.session_state.google_configured:
|
179 |
-
if "GOOGLE_API_KEY" in os.environ:
|
180 |
-
api_key = os.environ["GOOGLE_API_KEY"]
|
181 |
-
else:
|
182 |
-
api_key = st.session_state.api_keys.get("Google", "")
|
183 |
-
if not api_key:
|
184 |
-
raise ValueError(
|
185 |
-
"Google API key is required. Please set it in the app or as the GOOGLE_API_KEY environment variable.")
|
186 |
-
try:
|
187 |
-
configure(api_key=api_key) # Moved configure into try block
|
188 |
-
st.session_state.google_configured = True
|
189 |
-
except Exception as e:
|
190 |
-
raise ValueError(f"Error configuring Google API: {e}")
|
191 |
-
|
192 |
-
generation_config = genai.GenerationConfig(
|
193 |
-
temperature=st.session_state["temperature"],
|
194 |
-
top_p=st.session_state["top_p"],
|
195 |
-
top_k=st.session_state["top_k"],
|
196 |
-
max_output_tokens=st.session_state["max_tokens"],
|
197 |
-
)
|
198 |
-
safety_settings = st.session_state["safety_settings"] #Get Safety Settings
|
199 |
-
|
200 |
-
return GenerativeModel(model_name=model, generation_config=generation_config,
|
201 |
-
safety_settings=safety_settings) # Use all settings
|
202 |
else:
|
|
|
203 |
return OpenAI(
|
204 |
base_url=config["base_url"],
|
205 |
api_key=api_key,
|
206 |
-
timeout=
|
207 |
)
|
208 |
except Exception as e:
|
209 |
-
self._log_error(f"Client
|
210 |
return None
|
211 |
|
212 |
-
def _execute_generation(self, client, provider: str, model: str, prompt:
|
213 |
-
"""
|
214 |
-
|
215 |
-
|
216 |
-
|
217 |
-
|
218 |
-
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
|
223 |
-
|
224 |
-
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
|
230 |
-
|
231 |
-
|
232 |
-
|
233 |
-
|
234 |
-
|
235 |
-
|
236 |
-
|
237 |
-
|
238 |
-
|
239 |
-
|
240 |
-
|
241 |
-
|
242 |
-
|
243 |
-
|
244 |
-
return {"content": content} #Return raw text
|
245 |
-
|
246 |
-
else:
|
247 |
-
completion = client.chat.completions.create(
|
248 |
-
model=model,
|
249 |
-
messages=[{"role": "user", "content": prompt}],
|
250 |
-
temperature=st.session_state["temperature"], # Get temperature from session
|
251 |
-
max_tokens=st.session_state["max_tokens"], # Get max_tokens from session
|
252 |
-
stream=st.session_state["use_streaming"], # Use streaming bool from session
|
253 |
-
)
|
254 |
-
st.session_state.system_metrics["tokens_used"] += completion.usage.total_tokens
|
255 |
-
content = completion.choices[0].message.content
|
256 |
-
# Attempt to parse JSON if supported, otherwise return text
|
257 |
-
if self.PROVIDER_CONFIG[provider]["supports_json_output"]:
|
258 |
-
try:
|
259 |
-
return json.loads(content)
|
260 |
-
except json.JSONDecodeError:
|
261 |
-
return {"content": content,
|
262 |
-
"warning": "Could not parse response as valid JSON. Returning raw text."}
|
263 |
-
else:
|
264 |
-
return {"content": content} # return raw text
|
265 |
-
except requests.exceptions.RequestException as e:
|
266 |
-
self._log_error(f"API Request Error: {str(e)}")
|
267 |
-
return {"error": str(e), "content": ""}
|
268 |
-
except Exception as e:
|
269 |
-
self._log_error(f"Generation Error: {str(e)}")
|
270 |
-
return {"error": str(e), "content": ""}
|
271 |
|
272 |
-
def _failover_generation(self, prompt: str) -> Dict[str, Any]:
|
273 |
-
"""
|
|
|
|
|
|
|
274 |
for backup_provider in self.available_providers:
|
275 |
-
if backup_provider
|
276 |
-
|
277 |
-
|
278 |
-
|
279 |
-
|
280 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
281 |
|
282 |
# --- Input Processors ---
|
|
|
|
|
|
|
|
|
283 |
def _process_pdf(self, file) -> str:
|
284 |
-
"""Advanced PDF processing with OCR fallback"""
|
285 |
try:
|
286 |
with pdfplumber.open(file) as pdf:
|
287 |
return "\n".join(page.extract_text() or "" for page in pdf.pages)
|
@@ -289,268 +206,129 @@ class SyntheticDataGenerator:
|
|
289 |
self._log_error(f"PDF Processing Error: {str(e)}")
|
290 |
return ""
|
291 |
|
292 |
-
def _process_web(self, url: str) -> str:
|
293 |
-
"""Web content extraction with anti-bot measures"""
|
294 |
-
try:
|
295 |
-
response = requests.get(url, headers={
|
296 |
-
"User-Agent": "Mozilla/5.0 (compatible; SyntheticBot/1.0)"
|
297 |
-
}, timeout=10)
|
298 |
-
response.raise_for_status() # Raises HTTPError for bad responses (4xx or 5xx)
|
299 |
-
return response.text
|
300 |
-
except requests.exceptions.RequestException as e:
|
301 |
-
self._log_error(f"Web Extraction Error: {str(e)}")
|
302 |
-
return ""
|
303 |
-
except Exception as e:
|
304 |
-
self._log_error(f"Unexpected Web Extraction Error: {str(e)}")
|
305 |
-
return ""
|
306 |
-
|
307 |
def _process_csv(self, file) -> str:
|
308 |
-
"""Process CSV
|
309 |
try:
|
310 |
df = pd.read_csv(file)
|
311 |
-
|
312 |
-
return df.to_string()
|
313 |
except Exception as e:
|
314 |
self._log_error(f"CSV Processing Error: {str(e)}")
|
315 |
return ""
|
316 |
|
317 |
-
def
|
318 |
-
"""
|
319 |
-
return text
|
320 |
-
|
321 |
-
def _process_prompt_template(self, file) -> str:
|
322 |
-
"""Process prompt template file and store the content in session_state"""
|
323 |
try:
|
324 |
-
|
325 |
-
|
326 |
-
|
|
|
327 |
except Exception as e:
|
328 |
-
self._log_error(f"Prompt Template Processing Error: {str(e)}")
|
329 |
-
return ""
|
330 |
-
|
331 |
-
def _process_image(self, image_file) -> list:
|
332 |
-
"""Processes image files for multimodal generation (Google Gemini)"""
|
333 |
-
try:
|
334 |
-
image_data = image_file.read()
|
335 |
-
image_part = Part.from_data(image_data, mime_type=image_file.type) # Use Part for google
|
336 |
-
return [image_part] # Return a list with the image part as a Google Part object
|
337 |
-
|
338 |
-
except Exception as e:
|
339 |
-
self._log_error(f"Image Processing Error: {str(e)}")
|
340 |
-
return []
|
341 |
-
|
342 |
-
def _process_api(self, url: str, method="GET", headers: Optional[Dict[str, str]] = None,
|
343 |
-
data: Optional[Dict[str, Any]] = None) -> str:
|
344 |
-
"""Generic API endpoint processor with configurable methods and headers."""
|
345 |
-
try:
|
346 |
-
if method.upper() == "GET":
|
347 |
-
response = requests.get(url, headers=headers or {},
|
348 |
-
timeout=st.session_state["api_call_timeout"])
|
349 |
-
elif method.upper() == "POST":
|
350 |
-
response = requests.post(url, headers=headers or {}, json=data,
|
351 |
-
timeout=st.session_state["api_call_timeout"])
|
352 |
-
else:
|
353 |
-
raise ValueError("Unsupported HTTP method.")
|
354 |
-
response.raise_for_status() # Raise HTTPError for bad responses
|
355 |
-
|
356 |
-
try:
|
357 |
-
return json.dumps(response.json(), indent=2)
|
358 |
-
except json.JSONDecodeError:
|
359 |
-
return response.text
|
360 |
-
except requests.exceptions.RequestException as e:
|
361 |
self._log_error(f"API Processing Error: {str(e)}")
|
362 |
return ""
|
363 |
-
except Exception as e:
|
364 |
-
self._log_error(f"Unexpected API Processing Error: {str(e)}")
|
365 |
-
return ""
|
366 |
|
367 |
-
def _process_database(self,
|
368 |
-
"""
|
|
|
|
|
|
|
369 |
try:
|
|
|
|
|
|
|
|
|
370 |
engine = sqlalchemy.create_engine(connection_string)
|
371 |
with engine.connect() as connection:
|
372 |
-
result =
|
373 |
-
|
374 |
-
return df.to_string()
|
375 |
except Exception as e:
|
376 |
self._log_error(f"Database Processing Error: {str(e)}")
|
377 |
return ""
|
378 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
379 |
# --- Enterprise Features ---
|
380 |
def _log_error(self, message: str) -> None:
|
381 |
-
"""Centralized error logging with telemetry"""
|
382 |
-
st.session_state
|
383 |
-
st.session_state
|
384 |
-
|
385 |
-
if st.session_state.debug_mode:
|
386 |
st.error(f"[DEBUG] {message}")
|
387 |
|
388 |
def health_check(self) -> Dict[str, Any]:
|
389 |
-
"""Comprehensive system diagnostics"""
|
390 |
return {
|
391 |
"providers_available": self.available_providers,
|
392 |
"api_connectivity": {
|
393 |
provider: self._test_provider_connectivity(provider)
|
394 |
for provider in self.available_providers
|
395 |
},
|
396 |
-
"system_metrics": st.session_state
|
397 |
}
|
398 |
|
399 |
def _test_provider_connectivity(self, provider: str) -> bool:
|
400 |
-
"""Provider-specific connectivity test"""
|
401 |
try:
|
402 |
client = self._get_client(provider)
|
403 |
if provider == "HuggingFace":
|
404 |
-
|
405 |
-
|
406 |
-
headers=client["headers"],
|
407 |
-
timeout=5
|
408 |
-
)
|
409 |
return response.status_code == 200
|
410 |
-
elif provider == "OpenAI":
|
411 |
-
try:
|
412 |
-
client.models.list()
|
413 |
-
return True
|
414 |
-
except Exception:
|
415 |
-
return False
|
416 |
-
elif provider == "Groq":
|
417 |
-
try:
|
418 |
-
client.models.list()
|
419 |
-
return True
|
420 |
-
except Exception:
|
421 |
-
return False
|
422 |
-
elif provider == "Google":
|
423 |
-
try:
|
424 |
-
if not st.session_state.google_configured: # Check if google has been configured
|
425 |
-
|
426 |
-
api_key = st.session_state.api_keys.get("Google",
|
427 |
-
"") # Get Key from session state
|
428 |
-
|
429 |
-
if not api_key: # If that is not set, check environment variable.
|
430 |
-
api_key = os.environ.get("GOOGLE_API_KEY")
|
431 |
-
|
432 |
-
if not api_key:
|
433 |
-
return False # Cant test API if no API Key
|
434 |
-
|
435 |
-
configure(api_key=api_key) # Configure API Key
|
436 |
-
st.session_state.google_configured = True
|
437 |
-
|
438 |
-
genai.GenerativeModel(model_name=self.PROVIDER_CONFIG["Google"]["models"][0]).generate_content(
|
439 |
-
"test") # Test a generation
|
440 |
-
return True
|
441 |
-
|
442 |
-
except Exception as e: # Catch any exceptions
|
443 |
-
print(e)
|
444 |
-
return False
|
445 |
-
|
446 |
else:
|
447 |
-
|
448 |
-
|
|
|
449 |
except Exception:
|
450 |
return False
|
451 |
|
|
|
452 |
# --- Enterprise UI Components ---
|
453 |
-
def provider_config_ui(gen: SyntheticDataGenerator):
|
454 |
-
"""Advanced provider configuration interface"""
|
455 |
with st.sidebar:
|
456 |
st.header("⚙️ AI Engine Configuration")
|
457 |
-
|
458 |
-
# Provider selection with availability checks
|
459 |
provider = st.selectbox(
|
460 |
"AI Provider",
|
461 |
gen.available_providers,
|
462 |
help="Available providers based on system configuration",
|
|
|
463 |
)
|
464 |
-
st.session_state
|
465 |
|
466 |
-
# API key management
|
467 |
api_key = st.text_input(
|
468 |
f"{provider} API Key",
|
469 |
type="password",
|
470 |
-
value=st.session_state
|
471 |
-
help=f"Obtain API key from {provider} portal"
|
472 |
)
|
473 |
-
st.session_state
|
474 |
|
475 |
-
# Model selection
|
476 |
model = st.selectbox(
|
477 |
"Model",
|
478 |
gen.PROVIDER_CONFIG[provider]["models"],
|
479 |
-
help="Select model version based on your API plan"
|
480 |
)
|
481 |
-
|
482 |
-
|
483 |
-
# Advanced options
|
484 |
-
st.subheader("Advanced Options")
|
485 |
-
st.session_state["temperature"] = st.slider("Temperature", 0.0, 1.0, DEFAULT_TEMPERATURE, 0.05)
|
486 |
-
st.session_state["max_tokens"] = st.number_input("Max Tokens", 50, 4000, DEFAULT_MAX_TOKENS, 50)
|
487 |
-
st.session_state["use_streaming"] = st.checkbox("Enable Streaming")
|
488 |
-
st.session_state["api_call_timeout"] = st.slider("API Call Timeout (seconds)", 5, 60, API_TIMEOUT, 5)
|
489 |
-
|
490 |
-
# Google Specific Options
|
491 |
-
if provider == "Google":
|
492 |
-
st.subheader("Google Specific Settings")
|
493 |
-
st.session_state["top_p"] = st.slider("Top P", 0.0, 1.0, 0.95, 0.05, help="Nucleus sampling: Considers the most probable tokens.")
|
494 |
-
st.session_state["top_k"] = st.slider("Top K", 1, 100, 40, 1, help="Considers the top K most probable tokens.")
|
495 |
-
|
496 |
-
# Safety Settings Configuration
|
497 |
-
st.subheader("Safety Settings")
|
498 |
-
safety_categories = ["HARM_CATEGORY_HARASSMENT", "HARM_CATEGORY_HATE_SPEECH", "HARM_CATEGORY_SEXUALLY_EXPLICIT", "HARM_CATEGORY_DANGEROUS_CONTENT"]
|
499 |
-
threshold_options = ["BLOCK_NONE", "BLOCK_LOW_AND_ABOVE", "BLOCK_MEDIUM_AND_ABOVE", "BLOCK_ONLY_HIGH",]
|
500 |
-
|
501 |
-
for category in safety_categories:
|
502 |
-
threshold = st.selectbox(f"Threshold for {category}", options=threshold_options, index=2, key=f"{category}_threshold") # Start with Medium and Above
|
503 |
-
#Update Threshold
|
504 |
-
for setting in st.session_state["safety_settings"]:
|
505 |
-
if setting["category"] == category:
|
506 |
-
setting["threshold"] = threshold
|
507 |
-
break
|
508 |
-
|
509 |
-
|
510 |
-
# System monitoring
|
511 |
if st.button("Run Health Check"):
|
512 |
report = gen.health_check()
|
513 |
st.json(report)
|
514 |
|
515 |
-
|
516 |
-
|
517 |
-
|
518 |
-
["Text", "PDF", "Web URL", "CSV", "Prompt Template",
|
519 |
-
"Image"]) # Add Image input, Add Structured Prompt (Advanced)
|
520 |
-
|
521 |
-
input_content = None
|
522 |
-
|
523 |
-
if input_method == "Text":
|
524 |
-
input_content = st.text_area("Enter Text", height=200)
|
525 |
-
elif input_method == "PDF":
|
526 |
-
uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
|
527 |
-
if uploaded_file is not None:
|
528 |
-
input_content = uploaded_file
|
529 |
-
elif input_method == "Web URL":
|
530 |
-
url = st.text_input("Enter Web URL")
|
531 |
-
input_content = url
|
532 |
-
elif input_method == "CSV":
|
533 |
-
uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
|
534 |
-
if uploaded_file is not None:
|
535 |
-
input_content = uploaded_file
|
536 |
-
elif input_method == "Prompt Template":
|
537 |
-
uploaded_file = st.file_uploader("Upload a Prompt Template file", type=["txt", "j2"])
|
538 |
-
if uploaded_file is not None:
|
539 |
-
input_content = uploaded_file
|
540 |
-
elif input_method == "Image":
|
541 |
-
uploaded_file = st.file_uploader("Upload an Image file", type=["png", "jpg", "jpeg"])
|
542 |
-
if uploaded_file is not None:
|
543 |
-
input_content = uploaded_file
|
544 |
-
|
545 |
-
return input_method, input_content
|
546 |
-
|
547 |
-
|
548 |
-
def main():
|
549 |
-
"""Enterprise-grade user interface"""
|
550 |
st.set_page_config(
|
551 |
page_title="Synthetic Data Factory Pro",
|
552 |
page_icon="🏭",
|
553 |
-
layout="wide"
|
554 |
)
|
555 |
|
556 |
gen = SyntheticDataGenerator()
|
@@ -558,56 +336,55 @@ def main():
|
|
558 |
st.title("🏭 Synthetic Data Factory Pro")
|
559 |
st.markdown(
|
560 |
"""
|
561 |
-
|
562 |
-
|
563 |
-
|
564 |
)
|
565 |
|
566 |
provider_config_ui(gen)
|
567 |
|
568 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
569 |
|
570 |
-
if st.button("Generate Data"):
|
571 |
-
if input_content:
|
572 |
-
try:
|
573 |
-
if input_method == "Text":
|
574 |
-
processed_input = gen._process_text(input_content)
|
575 |
-
elif input_method == "PDF":
|
576 |
-
processed_input = gen._process_pdf(input_content)
|
577 |
-
elif input_method == "Web URL":
|
578 |
-
processed_input = gen._process_web(input_content)
|
579 |
-
elif input_method == "CSV":
|
580 |
-
processed_input = gen._process_csv(input_content)
|
581 |
-
elif input_method == "Prompt Template":
|
582 |
-
processed_input = gen._process_prompt_template(
|
583 |
-
input_content) # Process the uploaded template
|
584 |
-
elif input_method == "Image":
|
585 |
-
processed_input = gen._process_image(input_content) # Returns a List of Parts
|
586 |
-
|
587 |
-
# If a prompt template is loaded, use it.
|
588 |
-
if st.session_state["prompt_template"] is not None and input_method != "Prompt Template":
|
589 |
-
try:
|
590 |
-
from jinja2 import Template # Conditionally import it.
|
591 |
-
|
592 |
-
template = Template(st.session_state["prompt_template"]) # Load Jinja2 Template
|
593 |
-
processed_input = template.render(
|
594 |
-
input=processed_input) # Render the template - Overwrites the Input, Google needs parts, not text
|
595 |
-
|
596 |
-
except Exception as e:
|
597 |
-
st.error(f"Error rendering prompt template: {e}")
|
598 |
-
st.stop() # Stop the app if template rendering fails
|
599 |
-
|
600 |
-
if processed_input:
|
601 |
-
result = gen.generate(st.session_state.active_provider, st.session_state.active_model,
|
602 |
-
processed_input)
|
603 |
-
st.subheader("Generated Output:")
|
604 |
-
st.json(result)
|
605 |
-
else:
|
606 |
-
st.warning("No data to process. Please check your input.")
|
607 |
-
except Exception as e:
|
608 |
-
st.error(f"An unexpected error occurred: {e}")
|
609 |
-
else:
|
610 |
-
st.warning("Please provide input data.")
|
611 |
|
612 |
if __name__ == "__main__":
|
613 |
-
main()
|
|
|
1 |
+
import json
|
2 |
import requests
|
3 |
import streamlit as st
|
4 |
import pdfplumber
|
|
|
6 |
import sqlalchemy
|
7 |
from typing import Any, Dict, List, Optional, Union
|
8 |
from functools import lru_cache
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
|
10 |
# Provider clients with import guards
|
11 |
try:
|
|
|
18 |
except ImportError:
|
19 |
groq = None
|
20 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
|
22 |
class SyntheticDataGenerator:
|
23 |
"""World's Most Advanced Synthetic Data Generation System"""
|
24 |
|
25 |
+
PROVIDER_CONFIG: Dict[str, Dict[str, Union[str, List[str], Optional[str]]]] = {
|
26 |
"Deepseek": {
|
27 |
"base_url": "https://api.deepseek.com/v1",
|
28 |
"models": ["deepseek-chat"],
|
29 |
"requires_library": "openai",
|
|
|
30 |
},
|
31 |
"OpenAI": {
|
32 |
"base_url": "https://api.openai.com/v1",
|
33 |
"models": ["gpt-4-turbo", "gpt-3.5-turbo"],
|
34 |
"requires_library": "openai",
|
|
|
35 |
},
|
36 |
"Groq": {
|
37 |
"base_url": "https://api.groq.com/openai/v1",
|
38 |
"models": ["mixtral-8x7b-32768", "llama2-70b-4096"],
|
39 |
"requires_library": "groq",
|
|
|
40 |
},
|
41 |
"HuggingFace": {
|
42 |
"base_url": "https://api-inference.huggingface.co/models/",
|
43 |
"models": ["gpt2", "llama-2-13b-chat"],
|
44 |
"requires_library": None,
|
|
|
|
|
|
|
|
|
|
|
|
|
45 |
},
|
46 |
}
|
47 |
|
48 |
+
def __init__(self) -> None:
|
49 |
+
"""Initialize session state, input handlers, and providers."""
|
50 |
self._init_session_state()
|
51 |
self._setup_input_handlers()
|
52 |
self._setup_providers()
|
53 |
|
54 |
+
def _init_session_state(self) -> None:
|
55 |
+
"""Initialize enterprise-grade session management with default values."""
|
56 |
defaults = {
|
57 |
"active_provider": "OpenAI",
|
58 |
"api_keys": {},
|
|
|
63 |
"tokens_used": 0,
|
64 |
"error_count": 0,
|
65 |
},
|
66 |
+
"error_logs": [],
|
67 |
"debug_mode": False,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
68 |
}
|
69 |
for key, val in defaults.items():
|
70 |
if key not in st.session_state:
|
71 |
st.session_state[key] = val
|
72 |
|
73 |
+
def _setup_providers(self) -> None:
|
74 |
+
"""Configure available providers with health checks."""
|
75 |
+
self.available_providers: List[str] = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
76 |
for provider, config in self.PROVIDER_CONFIG.items():
|
77 |
+
required_lib = config.get("requires_library")
|
78 |
+
if required_lib and not globals().get(required_lib.title()):
|
79 |
continue # Skip providers with missing dependencies
|
80 |
self.available_providers.append(provider)
|
81 |
|
82 |
+
def _setup_input_handlers(self) -> None:
|
83 |
+
"""Register enterprise input processors."""
|
84 |
self.input_processors = {
|
85 |
"text": self._process_text,
|
86 |
"pdf": self._process_pdf,
|
|
|
88 |
"api": self._process_api,
|
89 |
"database": self._process_database,
|
90 |
"web": self._process_web,
|
|
|
|
|
91 |
}
|
92 |
|
|
|
93 |
@lru_cache(maxsize=100)
|
94 |
+
def generate(self, provider: str, model: str, prompt: str) -> Dict[str, Any]:
|
95 |
+
"""
|
96 |
+
Unified generation endpoint with failover support.
|
97 |
+
Caches results to improve performance.
|
98 |
+
"""
|
99 |
try:
|
100 |
if provider not in self.available_providers:
|
101 |
+
raise ValueError(f"Provider {provider} is not available.")
|
102 |
|
103 |
client = self._get_client(provider)
|
104 |
if not client:
|
105 |
+
raise ConnectionError("Client initialization failed.")
|
106 |
|
107 |
return self._execute_generation(client, provider, model, prompt)
|
108 |
|
109 |
except Exception as e:
|
110 |
+
self._log_error(f"Generation Error with provider '{provider}': {str(e)}")
|
111 |
+
return self._failover_generation(provider, model, prompt)
|
112 |
|
113 |
def _get_client(self, provider: str) -> Any:
|
114 |
+
"""
|
115 |
+
Secure client initialization with connection pooling.
|
116 |
+
Raises ValueError if API key is missing.
|
117 |
+
"""
|
118 |
config = self.PROVIDER_CONFIG[provider]
|
119 |
+
api_key = st.session_state["api_keys"].get(provider, "")
|
120 |
|
121 |
+
if not api_key:
|
122 |
+
raise ValueError(f"API key required for provider {provider}.")
|
123 |
|
124 |
try:
|
125 |
if provider == "Groq":
|
126 |
return groq.Groq(api_key=api_key)
|
127 |
elif provider == "HuggingFace":
|
128 |
return {"headers": {"Authorization": f"Bearer {api_key}"}}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
129 |
else:
|
130 |
+
# Initialize OpenAI client for OpenAI and Deepseek
|
131 |
return OpenAI(
|
132 |
base_url=config["base_url"],
|
133 |
api_key=api_key,
|
134 |
+
timeout=30
|
135 |
)
|
136 |
except Exception as e:
|
137 |
+
self._log_error(f"Client Initialization Failed for {provider}: {str(e)}")
|
138 |
return None
|
139 |
|
140 |
+
def _execute_generation(self, client: Any, provider: str, model: str, prompt: str) -> Dict[str, Any]:
|
141 |
+
"""
|
142 |
+
Execute provider-specific generation with circuit breaker.
|
143 |
+
Increments API call and token usage metrics.
|
144 |
+
"""
|
145 |
+
st.session_state["system_metrics"]["api_calls"] += 1
|
146 |
+
|
147 |
+
if provider == "HuggingFace":
|
148 |
+
url = self.PROVIDER_CONFIG[provider]["base_url"] + model
|
149 |
+
response = requests.post(
|
150 |
+
url,
|
151 |
+
headers=client["headers"],
|
152 |
+
json={"inputs": prompt},
|
153 |
+
timeout=30
|
154 |
+
)
|
155 |
+
response.raise_for_status()
|
156 |
+
return response.json()
|
157 |
+
else:
|
158 |
+
completion = client.chat.completions.create(
|
159 |
+
model=model,
|
160 |
+
messages=[{"role": "user", "content": prompt}],
|
161 |
+
temperature=0.1,
|
162 |
+
max_tokens=2000
|
163 |
+
)
|
164 |
+
# Update token usage if available
|
165 |
+
if hasattr(completion.usage, "total_tokens"):
|
166 |
+
st.session_state["system_metrics"]["tokens_used"] += completion.usage.total_tokens
|
167 |
+
try:
|
168 |
+
result = json.loads(completion.choices[0].message.content)
|
169 |
+
except json.JSONDecodeError:
|
170 |
+
result = {"response": completion.choices[0].message.content}
|
171 |
+
return result
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
172 |
|
173 |
+
def _failover_generation(self, original_provider: str, model: str, prompt: str) -> Dict[str, Any]:
|
174 |
+
"""
|
175 |
+
Enterprise failover to secondary providers.
|
176 |
+
Tries available providers (excluding the original) until one succeeds.
|
177 |
+
"""
|
178 |
for backup_provider in self.available_providers:
|
179 |
+
if backup_provider == original_provider:
|
180 |
+
continue
|
181 |
+
# Determine model to use: if the desired model is available, use it; otherwise use the first available model.
|
182 |
+
backup_models = self.PROVIDER_CONFIG[backup_provider]["models"]
|
183 |
+
backup_model = model if model in backup_models else backup_models[0]
|
184 |
+
try:
|
185 |
+
st.session_state["active_provider"] = backup_provider
|
186 |
+
result = self.generate(backup_provider, backup_model, prompt)
|
187 |
+
# Optionally, log the failover event
|
188 |
+
self._log_error(f"Failover succeeded with provider '{backup_provider}' using model '{backup_model}'.")
|
189 |
+
return result
|
190 |
+
except Exception as e:
|
191 |
+
self._log_error(f"Failover attempt with provider '{backup_provider}' failed: {str(e)}")
|
192 |
+
continue
|
193 |
+
raise RuntimeError("All generation providers are unavailable.")
|
194 |
|
195 |
# --- Input Processors ---
|
196 |
+
def _process_text(self, text: str) -> str:
|
197 |
+
"""Process plain text input by stripping unnecessary whitespace."""
|
198 |
+
return text.strip()
|
199 |
+
|
200 |
def _process_pdf(self, file) -> str:
|
201 |
+
"""Advanced PDF processing with OCR fallback."""
|
202 |
try:
|
203 |
with pdfplumber.open(file) as pdf:
|
204 |
return "\n".join(page.extract_text() or "" for page in pdf.pages)
|
|
|
206 |
self._log_error(f"PDF Processing Error: {str(e)}")
|
207 |
return ""
|
208 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
209 |
def _process_csv(self, file) -> str:
|
210 |
+
"""Process CSV input by reading it into a DataFrame and converting to CSV string."""
|
211 |
try:
|
212 |
df = pd.read_csv(file)
|
213 |
+
return df.to_csv(index=False)
|
|
|
214 |
except Exception as e:
|
215 |
self._log_error(f"CSV Processing Error: {str(e)}")
|
216 |
return ""
|
217 |
|
218 |
+
def _process_api(self, api_url: str) -> str:
|
219 |
+
"""Process API input by fetching JSON data from an endpoint."""
|
|
|
|
|
|
|
|
|
220 |
try:
|
221 |
+
response = requests.get(api_url, timeout=10)
|
222 |
+
response.raise_for_status()
|
223 |
+
data = response.json()
|
224 |
+
return json.dumps(data, indent=2)
|
225 |
except Exception as e:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
226 |
self._log_error(f"API Processing Error: {str(e)}")
|
227 |
return ""
|
|
|
|
|
|
|
228 |
|
229 |
+
def _process_database(self, config: Dict[str, str]) -> str:
|
230 |
+
"""
|
231 |
+
Process database input by executing a query.
|
232 |
+
Expects a configuration dictionary with 'connection_string' and 'query' keys.
|
233 |
+
"""
|
234 |
try:
|
235 |
+
connection_string = config.get("connection_string", "")
|
236 |
+
query = config.get("query", "")
|
237 |
+
if not connection_string or not query:
|
238 |
+
raise ValueError("Missing connection string or query.")
|
239 |
engine = sqlalchemy.create_engine(connection_string)
|
240 |
with engine.connect() as connection:
|
241 |
+
result = pd.read_sql(query, connection)
|
242 |
+
return result.to_csv(index=False)
|
|
|
243 |
except Exception as e:
|
244 |
self._log_error(f"Database Processing Error: {str(e)}")
|
245 |
return ""
|
246 |
|
247 |
+
def _process_web(self, url: str) -> str:
    """Fetch raw page content from *url*, identifying as a friendly bot."""
    # Declared User-Agent reduces trivial bot blocking.
    probe_headers = {"User-Agent": "Mozilla/5.0 (compatible; SyntheticBot/1.0)"}
    try:
        resp = requests.get(url, headers=probe_headers, timeout=10)
        resp.raise_for_status()
        return resp.text
    except Exception as e:
        self._log_error(f"Web Extraction Error: {str(e)}")
        return ""
|
258 |
+
|
259 |
# --- Enterprise Features ---
|
260 |
def _log_error(self, message: str) -> None:
    """Record an error in session telemetry and surface it in debug mode."""
    state = st.session_state
    state["system_metrics"]["error_count"] += 1
    state["error_logs"].append(message)
    # Only echo to the UI when the operator opted into debug output.
    if state.get("debug_mode"):
        st.error(f"[DEBUG] {message}")
|
266 |
|
267 |
def health_check(self) -> Dict[str, Any]:
    """Assemble a diagnostics report: providers, connectivity, session metrics."""
    connectivity = {
        name: self._test_provider_connectivity(name)
        for name in self.available_providers
    }
    return {
        "providers_available": self.available_providers,
        "api_connectivity": connectivity,
        "system_metrics": st.session_state["system_metrics"],
    }
|
277 |
|
278 |
def _test_provider_connectivity(self, provider: str) -> bool:
    """Probe whether *provider* is reachable with the configured credentials."""
    try:
        client = self._get_client(provider)
        if provider != "HuggingFace":
            # SDK-backed providers: listing models doubles as an auth check.
            client.models.list()
            return True
        # HuggingFace path: raw HTTP probe against the inference endpoint.
        base_url = self.PROVIDER_CONFIG[provider]["base_url"]
        probe = requests.get(base_url, headers=client["headers"], timeout=5)
        return probe.status_code == 200
    except Exception:
        # Any failure (missing key, network, SDK error) means "not reachable".
        return False
|
292 |
|
293 |
+
|
294 |
# --- Enterprise UI Components ---
|
295 |
+
def provider_config_ui(gen: SyntheticDataGenerator) -> None:
    """Render the sidebar provider/model/API-key configuration panel.

    Args:
        gen: Generator whose available providers and model catalog drive
            the widgets. Selections persist in ``st.session_state``
            ("active_provider", "api_keys").
    """
    with st.sidebar:
        st.header("⚙️ AI Engine Configuration")

        # BUGFIX: default to the previously chosen provider, but fall back
        # to index 0 — list.index() raises ValueError when the remembered
        # provider (or the "OpenAI" default) is not in available_providers,
        # e.g. when the openai package is not installed.
        preferred = st.session_state.get("active_provider", "OpenAI")
        default_index = (
            gen.available_providers.index(preferred)
            if preferred in gen.available_providers
            else 0
        )

        provider = st.selectbox(
            "AI Provider",
            gen.available_providers,
            help="Available providers based on system configuration",
            index=default_index,
        )
        st.session_state["active_provider"] = provider

        api_key = st.text_input(
            f"{provider} API Key",
            type="password",
            value=st.session_state["api_keys"].get(provider, ""),
            help=f"Obtain API key from {provider}'s portal"
        )
        st.session_state["api_keys"][provider] = api_key

        # Informational model picker; the generation flow re-asks for the
        # model at generation time.
        model = st.selectbox(
            "Model",
            gen.PROVIDER_CONFIG[provider]["models"],
            help="Select model version based on your API plan"
        )

        if st.button("Run Health Check"):
            report = gen.health_check()
            st.json(report)
|
324 |
|
325 |
+
|
326 |
+
def main() -> None:
    """Entry point: render the enterprise-grade Streamlit user interface.

    Sections: provider configuration sidebar, input ingestion/processing,
    and synthetic data generation.
    """
    # BUGFIX: the module-level `import json` was removed in a recent edit;
    # bind it locally so json.loads below does not raise NameError.
    import json

    st.set_page_config(
        page_title="Synthetic Data Factory Pro",
        page_icon="🏭",
        layout="wide"
    )

    gen = SyntheticDataGenerator()

    st.title("🏭 Synthetic Data Factory Pro")
    st.markdown(
        """
        **World's Most Advanced Synthetic Data Generation Platform**
        *Multi-provider AI Engine | Enterprise Input Processors | Real-time Monitoring*
        """
    )

    provider_config_ui(gen)

    # --- Input management and processing ---
    st.subheader("Input Data")
    input_type = st.selectbox("Select Input Type", list(gen.input_processors.keys()))
    if input_type == "text":
        user_input = st.text_area("Enter your text here:")
    elif input_type == "pdf":
        user_input = st.file_uploader("Upload a PDF file", type=["pdf"])
    elif input_type == "csv":
        user_input = st.file_uploader("Upload a CSV file", type=["csv"])
    elif input_type == "api":
        user_input = st.text_input("Enter API URL:")
    elif input_type == "database":
        user_input = st.text_area("Enter Database Config as JSON (with 'connection_string' and 'query'):")
    elif input_type == "web":
        user_input = st.text_input("Enter Website URL:")

    processed_input = ""
    if st.button("Process Input"):
        processor = gen.input_processors.get(input_type)
        if processor:
            if input_type in ["pdf", "csv"]:
                # File-upload inputs are passed through as file-like objects.
                processed_input = processor(user_input)
            elif input_type == "database":
                try:
                    db_config = json.loads(user_input)
                    processed_input = processor(db_config)
                except json.JSONDecodeError:
                    st.error("Invalid JSON for database configuration.")
            else:
                processed_input = processor(user_input)
        st.text_area("Processed Input", value=processed_input, height=200)

    # --- Generation section ---
    st.subheader("Generate Synthetic Data")
    prompt = st.text_area("Enter your prompt for data generation:")
    active_provider = st.session_state.get("active_provider", "OpenAI")
    # BUGFIX: the model selector must live OUTSIDE the button branch —
    # widgets created inside `if st.button(...)` disappear on the next
    # Streamlit rerun, so the user could never actually pick a model.
    model = st.selectbox(
        "Select Generation Model",
        gen.PROVIDER_CONFIG[active_provider]["models"],
    )
    if st.button("Generate"):
        result = gen.generate(active_provider, model, prompt)
        st.json(result)


if __name__ == "__main__":
    main()
|