mgbam committed on
Commit
4018394
·
verified ·
1 Parent(s): 68cb6a7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +192 -415
app.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import requests
2
  import streamlit as st
3
  import pdfplumber
@@ -5,13 +6,6 @@ import pandas as pd
5
  import sqlalchemy
6
  from typing import Any, Dict, List, Optional, Union
7
  from functools import lru_cache
8
- import json # Explicit import
9
- import os
10
-
11
- # --- Constants ---
12
- DEFAULT_TEMPERATURE = 0.1
13
- DEFAULT_MAX_TOKENS = 2000
14
- API_TIMEOUT = 30
15
 
16
  # Provider clients with import guards
17
  try:
@@ -24,58 +18,41 @@ try:
24
  except ImportError:
25
  groq = None
26
 
27
- try:
28
- import google.generativeai as genai
29
- from google.generativeai import GenerativeModel, configure, Part
30
- except ImportError:
31
- GenerativeModel = None
32
- configure = None
33
- genai = None
34
- Part = None
35
-
36
 
37
  class SyntheticDataGenerator:
38
  """World's Most Advanced Synthetic Data Generation System"""
39
 
40
- PROVIDER_CONFIG = {
41
  "Deepseek": {
42
  "base_url": "https://api.deepseek.com/v1",
43
  "models": ["deepseek-chat"],
44
  "requires_library": "openai",
45
- "supports_json_output": True, # Indicate that the provider reliably returns JSON
46
  },
47
  "OpenAI": {
48
  "base_url": "https://api.openai.com/v1",
49
  "models": ["gpt-4-turbo", "gpt-3.5-turbo"],
50
  "requires_library": "openai",
51
- "supports_json_output": True,
52
  },
53
  "Groq": {
54
  "base_url": "https://api.groq.com/openai/v1",
55
  "models": ["mixtral-8x7b-32768", "llama2-70b-4096"],
56
  "requires_library": "groq",
57
- "supports_json_output": True,
58
  },
59
  "HuggingFace": {
60
  "base_url": "https://api-inference.huggingface.co/models/",
61
  "models": ["gpt2", "llama-2-13b-chat"],
62
  "requires_library": None,
63
- "supports_json_output": False, # More likely to return text
64
- },
65
- "Google": {
66
- "models": ["gemini-1.5-flash-latest", "gemini-1.5-pro-latest", "gemini-pro", "gemini-pro-vision"],
67
- "requires_library": "google.generativeai",
68
- "supports_json_output": True
69
  },
70
  }
71
 
72
- def __init__(self):
 
73
  self._init_session_state()
74
  self._setup_input_handlers()
75
  self._setup_providers()
76
 
77
- def _init_session_state(self):
78
- """Initialize enterprise-grade session management"""
79
  defaults = {
80
  "active_provider": "OpenAI",
81
  "api_keys": {},
@@ -86,52 +63,24 @@ class SyntheticDataGenerator:
86
  "tokens_used": 0,
87
  "error_count": 0,
88
  },
 
89
  "debug_mode": False,
90
- "temperature": DEFAULT_TEMPERATURE, # Add temperature control
91
- "max_tokens": DEFAULT_MAX_TOKENS, # Add max token control
92
- "use_streaming": False, # Control Streaming behavior
93
- "prompt_template": None, # Support prompt templates
94
- "api_call_timeout": API_TIMEOUT, # API call timeout
95
- "image_parts": [], # Store image parts for multimodal generation
96
- "top_p": 0.95, # Default top_p for Google
97
- "top_k": 40, # Default top_k for Google
98
- "safety_settings": self._get_default_safety_settings(), #Default Safety Settings
99
  }
100
  for key, val in defaults.items():
101
  if key not in st.session_state:
102
  st.session_state[key] = val
103
 
104
- def _get_default_safety_settings(self):
105
- """Provides a default safety setting configuration for the Google provider"""
106
- return [
107
- {
108
- "category": "HARM_CATEGORY_HARASSMENT",
109
- "threshold": "BLOCK_MEDIUM_AND_ABOVE"
110
- },
111
- {
112
- "category": "HARM_CATEGORY_HATE_SPEECH",
113
- "threshold": "BLOCK_MEDIUM_AND_ABOVE"
114
- },
115
- {
116
- "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
117
- "threshold": "BLOCK_MEDIUM_AND_ABOVE"
118
- },
119
- {
120
- "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
121
- "threshold": "BLOCK_MEDIUM_AND_ABOVE"
122
- },
123
- ]
124
-
125
- def _setup_providers(self):
126
- """Configure available providers with health checks"""
127
- self.available_providers = []
128
  for provider, config in self.PROVIDER_CONFIG.items():
129
- if config["requires_library"] and not globals().get(config["requires_library"].split('.')[0].title()):
 
130
  continue # Skip providers with missing dependencies
131
  self.available_providers.append(provider)
132
 
133
- def _setup_input_handlers(self):
134
- """Register enterprise input processors"""
135
  self.input_processors = {
136
  "text": self._process_text,
137
  "pdf": self._process_pdf,
@@ -139,149 +88,117 @@ class SyntheticDataGenerator:
139
  "api": self._process_api,
140
  "database": self._process_database,
141
  "web": self._process_web,
142
- "prompt_template": self._process_prompt_template,
143
- "image": self._process_image,
144
  }
145
 
146
- # --- Core Generation Engine ---
147
  @lru_cache(maxsize=100)
148
- def generate(self, provider: str, model: str, prompt: Any) -> Dict[str, Any]:
149
- """Unified generation endpoint with failover support"""
 
 
 
150
  try:
151
  if provider not in self.available_providers:
152
- raise ValueError(f"Provider {provider} not available")
153
 
154
  client = self._get_client(provider)
155
  if not client:
156
- raise ConnectionError("Client initialization failed")
157
 
158
  return self._execute_generation(client, provider, model, prompt)
159
 
160
  except Exception as e:
161
- self._log_error(f"Generation Error: {str(e)}")
162
- return self._failover_generation(prompt)
163
 
164
  def _get_client(self, provider: str) -> Any:
165
- """Secure client initialization with connection pooling"""
 
 
 
166
  config = self.PROVIDER_CONFIG[provider]
167
- api_key = st.session_state.api_keys.get(provider, "")
168
 
169
- if not api_key and provider != "Google":
170
- raise ValueError(f"API key required for provider: {provider}")
171
 
172
  try:
173
  if provider == "Groq":
174
  return groq.Groq(api_key=api_key)
175
  elif provider == "HuggingFace":
176
  return {"headers": {"Authorization": f"Bearer {api_key}"}}
177
- elif provider == "Google":
178
- if not st.session_state.google_configured:
179
- if "GOOGLE_API_KEY" in os.environ:
180
- api_key = os.environ["GOOGLE_API_KEY"]
181
- else:
182
- api_key = st.session_state.api_keys.get("Google", "")
183
- if not api_key:
184
- raise ValueError(
185
- "Google API key is required. Please set it in the app or as the GOOGLE_API_KEY environment variable.")
186
- try:
187
- configure(api_key=api_key) # Moved configure into try block
188
- st.session_state.google_configured = True
189
- except Exception as e:
190
- raise ValueError(f"Error configuring Google API: {e}")
191
-
192
- generation_config = genai.GenerationConfig(
193
- temperature=st.session_state["temperature"],
194
- top_p=st.session_state["top_p"],
195
- top_k=st.session_state["top_k"],
196
- max_output_tokens=st.session_state["max_tokens"],
197
- )
198
- safety_settings = st.session_state["safety_settings"] #Get Safety Settings
199
-
200
- return GenerativeModel(model_name=model, generation_config=generation_config,
201
- safety_settings=safety_settings) # Use all settings
202
  else:
 
203
  return OpenAI(
204
  base_url=config["base_url"],
205
  api_key=api_key,
206
- timeout=st.session_state["api_call_timeout"], # Use session state timeout
207
  )
208
  except Exception as e:
209
- self._log_error(f"Client Init Failed: {str(e)}")
210
  return None
211
 
212
- def _execute_generation(self, client, provider: str, model: str, prompt: Any) -> Dict[str, Any]:
213
- """Execute provider-specific generation with circuit breaker"""
214
- st.session_state.system_metrics["api_calls"] += 1
215
-
216
- try:
217
- if provider == "HuggingFace":
218
- response = requests.post(
219
- self.PROVIDER_CONFIG[provider]["base_url"] + model,
220
- headers=client["headers"],
221
- json={"inputs": prompt},
222
- timeout=st.session_state["api_call_timeout"]
223
- )
224
- response.raise_for_status() # Raise HTTPError for bad responses
225
- return response.json()
226
- elif provider == "Google":
227
- # Construct parts list. If prompt is already a list, assume it contains Parts and text
228
-
229
- if isinstance(prompt, str):
230
- parts = [prompt] #If plain text
231
- else:
232
- parts = prompt #Multimodal prompt
233
-
234
- response = client.generate_content(parts) # Send parts to Google
235
-
236
- content = response.text
237
- if self.PROVIDER_CONFIG[provider]["supports_json_output"]:
238
- try:
239
- return json.loads(content)
240
- except json.JSONDecodeError:
241
- return {"content": content,
242
- "warning": "Could not parse response as valid JSON. Returning raw text."}
243
- else:
244
- return {"content": content} #Return raw text
245
-
246
- else:
247
- completion = client.chat.completions.create(
248
- model=model,
249
- messages=[{"role": "user", "content": prompt}],
250
- temperature=st.session_state["temperature"], # Get temperature from session
251
- max_tokens=st.session_state["max_tokens"], # Get max_tokens from session
252
- stream=st.session_state["use_streaming"], # Use streaming bool from session
253
- )
254
- st.session_state.system_metrics["tokens_used"] += completion.usage.total_tokens
255
- content = completion.choices[0].message.content
256
- # Attempt to parse JSON if supported, otherwise return text
257
- if self.PROVIDER_CONFIG[provider]["supports_json_output"]:
258
- try:
259
- return json.loads(content)
260
- except json.JSONDecodeError:
261
- return {"content": content,
262
- "warning": "Could not parse response as valid JSON. Returning raw text."}
263
- else:
264
- return {"content": content} # return raw text
265
- except requests.exceptions.RequestException as e:
266
- self._log_error(f"API Request Error: {str(e)}")
267
- return {"error": str(e), "content": ""}
268
- except Exception as e:
269
- self._log_error(f"Generation Error: {str(e)}")
270
- return {"error": str(e), "content": ""}
271
 
272
- def _failover_generation(self, prompt: str) -> Dict[str, Any]:
273
- """Enterprise failover to secondary providers"""
 
 
 
274
  for backup_provider in self.available_providers:
275
- if backup_provider != st.session_state.active_provider:
276
- try:
277
- return self.generate(backup_provider, ..., prompt=prompt)
278
- except Exception:
279
- continue
280
- raise RuntimeError("All generation providers unavailable")
 
 
 
 
 
 
 
 
 
281
 
282
  # --- Input Processors ---
 
 
 
 
283
  def _process_pdf(self, file) -> str:
284
- """Advanced PDF processing with OCR fallback"""
285
  try:
286
  with pdfplumber.open(file) as pdf:
287
  return "\n".join(page.extract_text() or "" for page in pdf.pages)
@@ -289,268 +206,129 @@ class SyntheticDataGenerator:
289
  self._log_error(f"PDF Processing Error: {str(e)}")
290
  return ""
291
 
292
- def _process_web(self, url: str) -> str:
293
- """Web content extraction with anti-bot measures"""
294
- try:
295
- response = requests.get(url, headers={
296
- "User-Agent": "Mozilla/5.0 (compatible; SyntheticBot/1.0)"
297
- }, timeout=10)
298
- response.raise_for_status() # Raises HTTPError for bad responses (4xx or 5xx)
299
- return response.text
300
- except requests.exceptions.RequestException as e:
301
- self._log_error(f"Web Extraction Error: {str(e)}")
302
- return ""
303
- except Exception as e:
304
- self._log_error(f"Unexpected Web Extraction Error: {str(e)}")
305
- return ""
306
-
307
  def _process_csv(self, file) -> str:
308
- """Process CSV files and return as a string representation."""
309
  try:
310
  df = pd.read_csv(file)
311
- # Add more sophisticated CSV processing here, e.g., schema inference
312
- return df.to_string()
313
  except Exception as e:
314
  self._log_error(f"CSV Processing Error: {str(e)}")
315
  return ""
316
 
317
- def _process_text(self, text: str) -> str:
318
- """Simple text passthrough processor"""
319
- return text
320
-
321
- def _process_prompt_template(self, file) -> str:
322
- """Process prompt template file and store the content in session_state"""
323
  try:
324
- template_content = file.read().decode("utf-8") # Read file content
325
- st.session_state["prompt_template"] = template_content # Store in session_state
326
- return "Prompt template uploaded and stored." # Inform the user
 
327
  except Exception as e:
328
- self._log_error(f"Prompt Template Processing Error: {str(e)}")
329
- return ""
330
-
331
- def _process_image(self, image_file) -> list:
332
- """Processes image files for multimodal generation (Google Gemini)"""
333
- try:
334
- image_data = image_file.read()
335
- image_part = Part.from_data(image_data, mime_type=image_file.type) # Use Part for google
336
- return [image_part] # Return a list with the image part as a Google Part object
337
-
338
- except Exception as e:
339
- self._log_error(f"Image Processing Error: {str(e)}")
340
- return []
341
-
342
- def _process_api(self, url: str, method="GET", headers: Optional[Dict[str, str]] = None,
343
- data: Optional[Dict[str, Any]] = None) -> str:
344
- """Generic API endpoint processor with configurable methods and headers."""
345
- try:
346
- if method.upper() == "GET":
347
- response = requests.get(url, headers=headers or {},
348
- timeout=st.session_state["api_call_timeout"])
349
- elif method.upper() == "POST":
350
- response = requests.post(url, headers=headers or {}, json=data,
351
- timeout=st.session_state["api_call_timeout"])
352
- else:
353
- raise ValueError("Unsupported HTTP method.")
354
- response.raise_for_status() # Raise HTTPError for bad responses
355
-
356
- try:
357
- return json.dumps(response.json(), indent=2)
358
- except json.JSONDecodeError:
359
- return response.text
360
- except requests.exceptions.RequestException as e:
361
  self._log_error(f"API Processing Error: {str(e)}")
362
  return ""
363
- except Exception as e:
364
- self._log_error(f"Unexpected API Processing Error: {str(e)}")
365
- return ""
366
 
367
- def _process_database(self, connection_string: str, query: str) -> str:
368
- """Database query processor using SQLAlchemy."""
 
 
 
369
  try:
 
 
 
 
370
  engine = sqlalchemy.create_engine(connection_string)
371
  with engine.connect() as connection:
372
- result = connection.execute(sqlalchemy.text(query))
373
- df = pd.DataFrame(result.fetchall(), columns=result.keys())
374
- return df.to_string()
375
  except Exception as e:
376
  self._log_error(f"Database Processing Error: {str(e)}")
377
  return ""
378
 
 
 
 
 
 
 
 
 
 
 
 
 
379
  # --- Enterprise Features ---
380
  def _log_error(self, message: str) -> None:
381
- """Centralized error logging with telemetry"""
382
- st.session_state.system_metrics["error_count"] += 1
383
- st.session_state.error_logs = st.session_state.get("error_logs", []) + [message]
384
-
385
- if st.session_state.debug_mode:
386
  st.error(f"[DEBUG] {message}")
387
 
388
  def health_check(self) -> Dict[str, Any]:
389
- """Comprehensive system diagnostics"""
390
  return {
391
  "providers_available": self.available_providers,
392
  "api_connectivity": {
393
  provider: self._test_provider_connectivity(provider)
394
  for provider in self.available_providers
395
  },
396
- "system_metrics": st.session_state.system_metrics,
397
  }
398
 
399
  def _test_provider_connectivity(self, provider: str) -> bool:
400
- """Provider-specific connectivity test"""
401
  try:
402
  client = self._get_client(provider)
403
  if provider == "HuggingFace":
404
- response = requests.get(
405
- self.PROVIDER_CONFIG[provider]["base_url"],
406
- headers=client["headers"],
407
- timeout=5
408
- )
409
  return response.status_code == 200
410
- elif provider == "OpenAI":
411
- try:
412
- client.models.list()
413
- return True
414
- except Exception:
415
- return False
416
- elif provider == "Groq":
417
- try:
418
- client.models.list()
419
- return True
420
- except Exception:
421
- return False
422
- elif provider == "Google":
423
- try:
424
- if not st.session_state.google_configured: # Check if google has been configured
425
-
426
- api_key = st.session_state.api_keys.get("Google",
427
- "") # Get Key from session state
428
-
429
- if not api_key: # If that is not set, check environment variable.
430
- api_key = os.environ.get("GOOGLE_API_KEY")
431
-
432
- if not api_key:
433
- return False # Cant test API if no API Key
434
-
435
- configure(api_key=api_key) # Configure API Key
436
- st.session_state.google_configured = True
437
-
438
- genai.GenerativeModel(model_name=self.PROVIDER_CONFIG["Google"]["models"][0]).generate_content(
439
- "test") # Test a generation
440
- return True
441
-
442
- except Exception as e: # Catch any exceptions
443
- print(e)
444
- return False
445
-
446
  else:
447
- return False # Unknown provider
448
-
 
449
  except Exception:
450
  return False
451
 
 
452
  # --- Enterprise UI Components ---
453
- def provider_config_ui(gen: SyntheticDataGenerator):
454
- """Advanced provider configuration interface"""
455
  with st.sidebar:
456
  st.header("⚙️ AI Engine Configuration")
457
-
458
- # Provider selection with availability checks
459
  provider = st.selectbox(
460
  "AI Provider",
461
  gen.available_providers,
462
  help="Available providers based on system configuration",
 
463
  )
464
- st.session_state.active_provider = provider
465
 
466
- # API key management
467
  api_key = st.text_input(
468
  f"{provider} API Key",
469
  type="password",
470
- value=st.session_state.api_keys.get(provider, ""),
471
- help=f"Obtain API key from {provider} portal",
472
  )
473
- st.session_state.api_keys[provider] = api_key
474
 
475
- # Model selection
476
  model = st.selectbox(
477
  "Model",
478
  gen.PROVIDER_CONFIG[provider]["models"],
479
- help="Select model version based on your API plan",
480
  )
481
- st.session_state.active_model = model
482
-
483
- # Advanced options
484
- st.subheader("Advanced Options")
485
- st.session_state["temperature"] = st.slider("Temperature", 0.0, 1.0, DEFAULT_TEMPERATURE, 0.05)
486
- st.session_state["max_tokens"] = st.number_input("Max Tokens", 50, 4000, DEFAULT_MAX_TOKENS, 50)
487
- st.session_state["use_streaming"] = st.checkbox("Enable Streaming")
488
- st.session_state["api_call_timeout"] = st.slider("API Call Timeout (seconds)", 5, 60, API_TIMEOUT, 5)
489
-
490
- # Google Specific Options
491
- if provider == "Google":
492
- st.subheader("Google Specific Settings")
493
- st.session_state["top_p"] = st.slider("Top P", 0.0, 1.0, 0.95, 0.05, help="Nucleus sampling: Considers the most probable tokens.")
494
- st.session_state["top_k"] = st.slider("Top K", 1, 100, 40, 1, help="Considers the top K most probable tokens.")
495
-
496
- # Safety Settings Configuration
497
- st.subheader("Safety Settings")
498
- safety_categories = ["HARM_CATEGORY_HARASSMENT", "HARM_CATEGORY_HATE_SPEECH", "HARM_CATEGORY_SEXUALLY_EXPLICIT", "HARM_CATEGORY_DANGEROUS_CONTENT"]
499
- threshold_options = ["BLOCK_NONE", "BLOCK_LOW_AND_ABOVE", "BLOCK_MEDIUM_AND_ABOVE", "BLOCK_ONLY_HIGH",]
500
-
501
- for category in safety_categories:
502
- threshold = st.selectbox(f"Threshold for {category}", options=threshold_options, index=2, key=f"{category}_threshold") # Start with Medium and Above
503
- #Update Threshold
504
- for setting in st.session_state["safety_settings"]:
505
- if setting["category"] == category:
506
- setting["threshold"] = threshold
507
- break
508
-
509
-
510
- # System monitoring
511
  if st.button("Run Health Check"):
512
  report = gen.health_check()
513
  st.json(report)
514
 
515
- def input_ui():
516
- """Creates the input method UI"""
517
- input_method = st.selectbox("Input Method",
518
- ["Text", "PDF", "Web URL", "CSV", "Prompt Template",
519
- "Image"]) # Add Image input, Add Structured Prompt (Advanced)
520
-
521
- input_content = None
522
-
523
- if input_method == "Text":
524
- input_content = st.text_area("Enter Text", height=200)
525
- elif input_method == "PDF":
526
- uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
527
- if uploaded_file is not None:
528
- input_content = uploaded_file
529
- elif input_method == "Web URL":
530
- url = st.text_input("Enter Web URL")
531
- input_content = url
532
- elif input_method == "CSV":
533
- uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
534
- if uploaded_file is not None:
535
- input_content = uploaded_file
536
- elif input_method == "Prompt Template":
537
- uploaded_file = st.file_uploader("Upload a Prompt Template file", type=["txt", "j2"])
538
- if uploaded_file is not None:
539
- input_content = uploaded_file
540
- elif input_method == "Image":
541
- uploaded_file = st.file_uploader("Upload an Image file", type=["png", "jpg", "jpeg"])
542
- if uploaded_file is not None:
543
- input_content = uploaded_file
544
-
545
- return input_method, input_content
546
-
547
-
548
- def main():
549
- """Enterprise-grade user interface"""
550
  st.set_page_config(
551
  page_title="Synthetic Data Factory Pro",
552
  page_icon="🏭",
553
- layout="wide",
554
  )
555
 
556
  gen = SyntheticDataGenerator()
@@ -558,56 +336,55 @@ def main():
558
  st.title("🏭 Synthetic Data Factory Pro")
559
  st.markdown(
560
  """
561
- **World's Most Advanced Synthetic Data Generation Platform**
562
- *Multi-provider AI Engine | Enterprise Input Processors | Real-time Monitoring*
563
- """
564
  )
565
 
566
  provider_config_ui(gen)
567
 
568
- input_method, input_content = input_ui()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
569
 
570
- if st.button("Generate Data"):
571
- if input_content:
572
- try:
573
- if input_method == "Text":
574
- processed_input = gen._process_text(input_content)
575
- elif input_method == "PDF":
576
- processed_input = gen._process_pdf(input_content)
577
- elif input_method == "Web URL":
578
- processed_input = gen._process_web(input_content)
579
- elif input_method == "CSV":
580
- processed_input = gen._process_csv(input_content)
581
- elif input_method == "Prompt Template":
582
- processed_input = gen._process_prompt_template(
583
- input_content) # Process the uploaded template
584
- elif input_method == "Image":
585
- processed_input = gen._process_image(input_content) # Returns a List of Parts
586
-
587
- # If a prompt template is loaded, use it.
588
- if st.session_state["prompt_template"] is not None and input_method != "Prompt Template":
589
- try:
590
- from jinja2 import Template # Conditionally import it.
591
-
592
- template = Template(st.session_state["prompt_template"]) # Load Jinja2 Template
593
- processed_input = template.render(
594
- input=processed_input) # Render the template - Overwrites the Input, Google needs parts, not text
595
-
596
- except Exception as e:
597
- st.error(f"Error rendering prompt template: {e}")
598
- st.stop() # Stop the app if template rendering fails
599
-
600
- if processed_input:
601
- result = gen.generate(st.session_state.active_provider, st.session_state.active_model,
602
- processed_input)
603
- st.subheader("Generated Output:")
604
- st.json(result)
605
- else:
606
- st.warning("No data to process. Please check your input.")
607
- except Exception as e:
608
- st.error(f"An unexpected error occurred: {e}")
609
- else:
610
- st.warning("Please provide input data.")
611
 
612
  if __name__ == "__main__":
613
- main()
 
1
+ import json
2
  import requests
3
  import streamlit as st
4
  import pdfplumber
 
6
  import sqlalchemy
7
  from typing import Any, Dict, List, Optional, Union
8
  from functools import lru_cache
 
 
 
 
 
 
 
9
 
10
  # Provider clients with import guards
11
  try:
 
18
  except ImportError:
19
  groq = None
20
 
 
 
 
 
 
 
 
 
 
21
 
22
  class SyntheticDataGenerator:
23
  """World's Most Advanced Synthetic Data Generation System"""
24
 
25
+ PROVIDER_CONFIG: Dict[str, Dict[str, Union[str, List[str], Optional[str]]]] = {
26
  "Deepseek": {
27
  "base_url": "https://api.deepseek.com/v1",
28
  "models": ["deepseek-chat"],
29
  "requires_library": "openai",
 
30
  },
31
  "OpenAI": {
32
  "base_url": "https://api.openai.com/v1",
33
  "models": ["gpt-4-turbo", "gpt-3.5-turbo"],
34
  "requires_library": "openai",
 
35
  },
36
  "Groq": {
37
  "base_url": "https://api.groq.com/openai/v1",
38
  "models": ["mixtral-8x7b-32768", "llama2-70b-4096"],
39
  "requires_library": "groq",
 
40
  },
41
  "HuggingFace": {
42
  "base_url": "https://api-inference.huggingface.co/models/",
43
  "models": ["gpt2", "llama-2-13b-chat"],
44
  "requires_library": None,
 
 
 
 
 
 
45
  },
46
  }
47
 
48
+ def __init__(self) -> None:
49
+ """Initialize session state, input handlers, and providers."""
50
  self._init_session_state()
51
  self._setup_input_handlers()
52
  self._setup_providers()
53
 
54
+ def _init_session_state(self) -> None:
55
+ """Initialize enterprise-grade session management with default values."""
56
  defaults = {
57
  "active_provider": "OpenAI",
58
  "api_keys": {},
 
63
  "tokens_used": 0,
64
  "error_count": 0,
65
  },
66
+ "error_logs": [],
67
  "debug_mode": False,
 
 
 
 
 
 
 
 
 
68
  }
69
  for key, val in defaults.items():
70
  if key not in st.session_state:
71
  st.session_state[key] = val
72
 
73
+ def _setup_providers(self) -> None:
74
+ """Configure available providers with health checks."""
75
+ self.available_providers: List[str] = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
  for provider, config in self.PROVIDER_CONFIG.items():
77
+ required_lib = config.get("requires_library")
78
+ if required_lib and not globals().get(required_lib.title()):
79
  continue # Skip providers with missing dependencies
80
  self.available_providers.append(provider)
81
 
82
+ def _setup_input_handlers(self) -> None:
83
+ """Register enterprise input processors."""
84
  self.input_processors = {
85
  "text": self._process_text,
86
  "pdf": self._process_pdf,
 
88
  "api": self._process_api,
89
  "database": self._process_database,
90
  "web": self._process_web,
 
 
91
  }
92
 
 
93
  @lru_cache(maxsize=100)
94
+ def generate(self, provider: str, model: str, prompt: str) -> Dict[str, Any]:
95
+ """
96
+ Unified generation endpoint with failover support.
97
+ Caches results to improve performance.
98
+ """
99
  try:
100
  if provider not in self.available_providers:
101
+ raise ValueError(f"Provider {provider} is not available.")
102
 
103
  client = self._get_client(provider)
104
  if not client:
105
+ raise ConnectionError("Client initialization failed.")
106
 
107
  return self._execute_generation(client, provider, model, prompt)
108
 
109
  except Exception as e:
110
+ self._log_error(f"Generation Error with provider '{provider}': {str(e)}")
111
+ return self._failover_generation(provider, model, prompt)
112
 
113
  def _get_client(self, provider: str) -> Any:
114
+ """
115
+ Secure client initialization with connection pooling.
116
+ Raises ValueError if API key is missing.
117
+ """
118
  config = self.PROVIDER_CONFIG[provider]
119
+ api_key = st.session_state["api_keys"].get(provider, "")
120
 
121
+ if not api_key:
122
+ raise ValueError(f"API key required for provider {provider}.")
123
 
124
  try:
125
  if provider == "Groq":
126
  return groq.Groq(api_key=api_key)
127
  elif provider == "HuggingFace":
128
  return {"headers": {"Authorization": f"Bearer {api_key}"}}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129
  else:
130
+ # Initialize OpenAI client for OpenAI and Deepseek
131
  return OpenAI(
132
  base_url=config["base_url"],
133
  api_key=api_key,
134
+ timeout=30
135
  )
136
  except Exception as e:
137
+ self._log_error(f"Client Initialization Failed for {provider}: {str(e)}")
138
  return None
139
 
140
+ def _execute_generation(self, client: Any, provider: str, model: str, prompt: str) -> Dict[str, Any]:
141
+ """
142
+ Execute provider-specific generation with circuit breaker.
143
+ Increments API call and token usage metrics.
144
+ """
145
+ st.session_state["system_metrics"]["api_calls"] += 1
146
+
147
+ if provider == "HuggingFace":
148
+ url = self.PROVIDER_CONFIG[provider]["base_url"] + model
149
+ response = requests.post(
150
+ url,
151
+ headers=client["headers"],
152
+ json={"inputs": prompt},
153
+ timeout=30
154
+ )
155
+ response.raise_for_status()
156
+ return response.json()
157
+ else:
158
+ completion = client.chat.completions.create(
159
+ model=model,
160
+ messages=[{"role": "user", "content": prompt}],
161
+ temperature=0.1,
162
+ max_tokens=2000
163
+ )
164
+ # Update token usage if available
165
+ if hasattr(completion.usage, "total_tokens"):
166
+ st.session_state["system_metrics"]["tokens_used"] += completion.usage.total_tokens
167
+ try:
168
+ result = json.loads(completion.choices[0].message.content)
169
+ except json.JSONDecodeError:
170
+ result = {"response": completion.choices[0].message.content}
171
+ return result
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
172
 
173
+ def _failover_generation(self, original_provider: str, model: str, prompt: str) -> Dict[str, Any]:
174
+ """
175
+ Enterprise failover to secondary providers.
176
+ Tries available providers (excluding the original) until one succeeds.
177
+ """
178
  for backup_provider in self.available_providers:
179
+ if backup_provider == original_provider:
180
+ continue
181
+ # Determine model to use: if the desired model is available, use it; otherwise use the first available model.
182
+ backup_models = self.PROVIDER_CONFIG[backup_provider]["models"]
183
+ backup_model = model if model in backup_models else backup_models[0]
184
+ try:
185
+ st.session_state["active_provider"] = backup_provider
186
+ result = self.generate(backup_provider, backup_model, prompt)
187
+ # Optionally, log the failover event
188
+ self._log_error(f"Failover succeeded with provider '{backup_provider}' using model '{backup_model}'.")
189
+ return result
190
+ except Exception as e:
191
+ self._log_error(f"Failover attempt with provider '{backup_provider}' failed: {str(e)}")
192
+ continue
193
+ raise RuntimeError("All generation providers are unavailable.")
194
 
195
  # --- Input Processors ---
196
+ def _process_text(self, text: str) -> str:
197
+ """Process plain text input by stripping unnecessary whitespace."""
198
+ return text.strip()
199
+
200
  def _process_pdf(self, file) -> str:
201
+ """Advanced PDF processing with OCR fallback."""
202
  try:
203
  with pdfplumber.open(file) as pdf:
204
  return "\n".join(page.extract_text() or "" for page in pdf.pages)
 
206
  self._log_error(f"PDF Processing Error: {str(e)}")
207
  return ""
208
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
209
  def _process_csv(self, file) -> str:
210
+ """Process CSV input by reading it into a DataFrame and converting to CSV string."""
211
  try:
212
  df = pd.read_csv(file)
213
+ return df.to_csv(index=False)
 
214
  except Exception as e:
215
  self._log_error(f"CSV Processing Error: {str(e)}")
216
  return ""
217
 
218
+ def _process_api(self, api_url: str) -> str:
219
+ """Process API input by fetching JSON data from an endpoint."""
 
 
 
 
220
  try:
221
+ response = requests.get(api_url, timeout=10)
222
+ response.raise_for_status()
223
+ data = response.json()
224
+ return json.dumps(data, indent=2)
225
  except Exception as e:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
226
  self._log_error(f"API Processing Error: {str(e)}")
227
  return ""
 
 
 
228
 
229
+ def _process_database(self, config: Dict[str, str]) -> str:
230
+ """
231
+ Process database input by executing a query.
232
+ Expects a configuration dictionary with 'connection_string' and 'query' keys.
233
+ """
234
  try:
235
+ connection_string = config.get("connection_string", "")
236
+ query = config.get("query", "")
237
+ if not connection_string or not query:
238
+ raise ValueError("Missing connection string or query.")
239
  engine = sqlalchemy.create_engine(connection_string)
240
  with engine.connect() as connection:
241
+ result = pd.read_sql(query, connection)
242
+ return result.to_csv(index=False)
 
243
  except Exception as e:
244
  self._log_error(f"Database Processing Error: {str(e)}")
245
  return ""
246
 
247
+ def _process_web(self, url: str) -> str:
248
+ """Web content extraction with anti-bot measures."""
249
+ try:
250
+ response = requests.get(url, headers={
251
+ "User-Agent": "Mozilla/5.0 (compatible; SyntheticBot/1.0)"
252
+ }, timeout=10)
253
+ response.raise_for_status()
254
+ return response.text
255
+ except Exception as e:
256
+ self._log_error(f"Web Extraction Error: {str(e)}")
257
+ return ""
258
+
259
  # --- Enterprise Features ---
260
  def _log_error(self, message: str) -> None:
261
+ """Centralized error logging with telemetry."""
262
+ st.session_state["system_metrics"]["error_count"] += 1
263
+ st.session_state["error_logs"].append(message)
264
+ if st.session_state.get("debug_mode"):
 
265
  st.error(f"[DEBUG] {message}")
266
 
267
  def health_check(self) -> Dict[str, Any]:
268
+ """Comprehensive system diagnostics."""
269
  return {
270
  "providers_available": self.available_providers,
271
  "api_connectivity": {
272
  provider: self._test_provider_connectivity(provider)
273
  for provider in self.available_providers
274
  },
275
+ "system_metrics": st.session_state["system_metrics"],
276
  }
277
 
278
  def _test_provider_connectivity(self, provider: str) -> bool:
279
+ """Provider-specific connectivity test."""
280
  try:
281
  client = self._get_client(provider)
282
  if provider == "HuggingFace":
283
+ url = self.PROVIDER_CONFIG[provider]["base_url"]
284
+ response = requests.get(url, headers=client["headers"], timeout=5)
 
 
 
285
  return response.status_code == 200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
286
  else:
287
+ # Assuming client has a models.list() method for other providers.
288
+ client.models.list()
289
+ return True
290
  except Exception:
291
  return False
292
 
293
+
294
  # --- Enterprise UI Components ---
295
def provider_config_ui(gen: SyntheticDataGenerator) -> None:
    """Render the sidebar for provider / API-key / model configuration.

    Persists the user's choices in ``st.session_state`` ("active_provider",
    "api_keys", "active_model") so the rest of the app can read them, and
    offers a one-click health check.

    Args:
        gen: The generator whose available providers and models are shown.
    """
    with st.sidebar:
        st.header("⚙️ AI Engine Configuration")

        # BUGFIX: .index() raised ValueError whenever the stored (or default
        # "OpenAI") provider was not actually available — e.g. when the
        # backing SDK is not installed. Fall back to the first entry.
        preferred = st.session_state.get("active_provider", "OpenAI")
        default_index = (
            gen.available_providers.index(preferred)
            if preferred in gen.available_providers
            else 0
        )
        provider = st.selectbox(
            "AI Provider",
            gen.available_providers,
            help="Available providers based on system configuration",
            index=default_index,
        )
        st.session_state["active_provider"] = provider

        api_key = st.text_input(
            f"{provider} API Key",
            type="password",
            value=st.session_state["api_keys"].get(provider, ""),
            help=f"Obtain API key from {provider}'s portal",
        )
        st.session_state["api_keys"][provider] = api_key

        model = st.selectbox(
            "Model",
            gen.PROVIDER_CONFIG[provider]["models"],
            help="Select model version based on your API plan",
        )
        # Persist so other views can reuse the selection instead of re-asking.
        st.session_state["active_model"] = model

        if st.button("Run Health Check"):
            report = gen.health_check()
            st.json(report)
324
 
325
+
326
def main() -> None:
    """Entry point: render the Streamlit UI for the data factory.

    Wires together provider configuration, input collection/processing,
    and synthetic-data generation.
    """
    # Local import: the module-level `import json` was removed in this
    # commit, which would make json.loads below raise NameError.
    import json

    st.set_page_config(
        page_title="Synthetic Data Factory Pro",
        page_icon="🏭",
        layout="wide",
    )

    gen = SyntheticDataGenerator()

    st.title("🏭 Synthetic Data Factory Pro")
    st.markdown(
        """
    **World's Most Advanced Synthetic Data Generation Platform**
    *Multi-provider AI Engine | Enterprise Input Processors | Real-time Monitoring*
    """
    )

    provider_config_ui(gen)

    # --- Input management and processing ---
    st.subheader("Input Data")
    input_type = st.selectbox("Select Input Type", list(gen.input_processors.keys()))
    user_input = None  # defensive default: avoids NameError for unknown types
    if input_type == "text":
        user_input = st.text_area("Enter your text here:")
    elif input_type == "pdf":
        user_input = st.file_uploader("Upload a PDF file", type=["pdf"])
    elif input_type == "csv":
        user_input = st.file_uploader("Upload a CSV file", type=["csv"])
    elif input_type == "api":
        user_input = st.text_input("Enter API URL:")
    elif input_type == "database":
        user_input = st.text_area(
            "Enter Database Config as JSON (with 'connection_string' and 'query'):"
        )
    elif input_type == "web":
        user_input = st.text_input("Enter Website URL:")

    processed_input = ""
    if st.button("Process Input"):
        processor = gen.input_processors.get(input_type)
        if processor:
            if input_type in ["pdf", "csv"]:
                processed_input = processor(user_input)
            elif input_type == "database":
                try:
                    db_config = json.loads(user_input)
                    processed_input = processor(db_config)
                except json.JSONDecodeError:
                    st.error("Invalid JSON for database configuration.")
            else:
                processed_input = processor(user_input)
    st.text_area("Processed Input", value=processed_input, height=200)

    # --- Generation section ---
    st.subheader("Generate Synthetic Data")
    prompt = st.text_area("Enter your prompt for data generation:")
    active_provider = st.session_state.get("active_provider", "OpenAI")
    # BUGFIX: this selectbox used to be created *inside* the Generate
    # branch; Streamlit widgets instantiated in a button branch disappear
    # on the rerun the click triggers, so the picker could never be used.
    model = st.selectbox(
        "Select Generation Model", gen.PROVIDER_CONFIG[active_provider]["models"]
    )
    if st.button("Generate"):
        result = gen.generate(active_provider, model, prompt)
        st.json(result)
387
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
388
 
389
# Script entry point: launch the Streamlit application.
if __name__ == "__main__":
    main()