mgbam committed on
Commit
2f242fe
·
verified ·
1 Parent(s): 8cd330b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +231 -190
app.py CHANGED
@@ -3,10 +3,16 @@ import streamlit as st
3
  import pdfplumber
4
  import pandas as pd
5
  import sqlalchemy
6
- from typing import Any, Dict, List, Optional
7
  from functools import lru_cache
 
8
  import os
9
 
 
 
 
 
 
10
  # Provider clients with import guards
11
  try:
12
  from openai import OpenAI
@@ -27,7 +33,6 @@ except ImportError:
27
  genai = None
28
  Part = None
29
 
30
- import json
31
 
32
  class SyntheticDataGenerator:
33
  """World's Most Advanced Synthetic Data Generation System"""
@@ -36,27 +41,32 @@ class SyntheticDataGenerator:
36
  "Deepseek": {
37
  "base_url": "https://api.deepseek.com/v1",
38
  "models": ["deepseek-chat"],
39
- "requires_library": "openai"
 
40
  },
41
  "OpenAI": {
42
  "base_url": "https://api.openai.com/v1",
43
  "models": ["gpt-4-turbo", "gpt-3.5-turbo"],
44
- "requires_library": "openai"
 
45
  },
46
  "Groq": {
47
  "base_url": "https://api.groq.com/openai/v1",
48
  "models": ["mixtral-8x7b-32768", "llama2-70b-4096"],
49
- "requires_library": "groq"
 
50
  },
51
  "HuggingFace": {
52
  "base_url": "https://api-inference.huggingface.co/models/",
53
  "models": ["gpt2", "llama-2-13b-chat"],
54
- "requires_library": None
 
55
  },
56
  "Google": {
57
  "models": ["gemini-1.5-flash-latest", "gemini-1.5-pro-latest", "gemini-pro", "gemini-pro-vision"],
58
- "requires_library": "google.generativeai"
59
- }
 
60
  }
61
 
62
  def __init__(self):
@@ -74,29 +84,50 @@ class SyntheticDataGenerator:
74
  "system_metrics": {
75
  "api_calls": 0,
76
  "tokens_used": 0,
77
- "error_count": 0
78
  },
79
  "debug_mode": False,
80
- "google_configured": False,
81
- "advanced_options": {
82
- "temperature": 0.7,
83
- "top_p": 0.95,
84
- "top_k": 40,
85
- "max_output_tokens": 2000
86
- },
87
- "generation_format": "json",
88
- "csv_schema": ""
89
  }
90
  for key, val in defaults.items():
91
  if key not in st.session_state:
92
  st.session_state[key] = val
93
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
  def _setup_providers(self):
95
  """Configure available providers with health checks"""
96
  self.available_providers = []
97
  for provider, config in self.PROVIDER_CONFIG.items():
98
  if config["requires_library"] and not globals().get(config["requires_library"].split('.')[0].title()):
99
- continue
100
  self.available_providers.append(provider)
101
 
102
  def _setup_input_handlers(self):
@@ -108,12 +139,13 @@ class SyntheticDataGenerator:
108
  "api": self._process_api,
109
  "database": self._process_database,
110
  "web": self._process_web,
111
- "image": self._process_image
 
112
  }
113
 
114
  # --- Core Generation Engine ---
115
  @lru_cache(maxsize=100)
116
- def generate(self, provider: str, model: str, prompt: Any) -> Dict[str, Any]: # Allow "prompt" to be a list or a string
117
  """Unified generation endpoint with failover support"""
118
  try:
119
  if provider not in self.available_providers:
@@ -135,7 +167,7 @@ class SyntheticDataGenerator:
135
  api_key = st.session_state.api_keys.get(provider, "")
136
 
137
  if not api_key and provider != "Google":
138
- raise ValueError("API key required")
139
 
140
  try:
141
  if provider == "Groq":
@@ -158,92 +190,84 @@ class SyntheticDataGenerator:
158
  raise ValueError(f"Error configuring Google API: {e}")
159
 
160
  generation_config = genai.GenerationConfig(
161
- temperature=st.session_state.advanced_options["temperature"],
162
- top_p=st.session_state.advanced_options["top_p"],
163
- top_k=st.session_state.advanced_options["top_k"],
164
- max_output_tokens=st.session_state.advanced_options["max_output_tokens"]
165
  )
166
- safety_settings = [
167
- {
168
- "category": "HARM_CATEGORY_HARASSMENT",
169
- "threshold": "BLOCK_MEDIUM_AND_ABOVE"
170
- },
171
- {
172
- "category": "HARM_CATEGORY_HATE_SPEECH",
173
- "threshold": "BLOCK_MEDIUM_AND_ABOVE"
174
- },
175
- {
176
- "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
177
- "threshold": "BLOCK_MEDIUM_AND_ABOVE"
178
- },
179
- {
180
- "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
181
- "threshold": "BLOCK_MEDIUM_AND_ABOVE"
182
- },
183
- ]
184
- return GenerativeModel(model_name=model, generation_config=generation_config, safety_settings=safety_settings)
185
  else:
186
  return OpenAI(
187
  base_url=config["base_url"],
188
  api_key=api_key,
189
- timeout=30
190
  )
191
  except Exception as e:
192
  self._log_error(f"Client Init Failed: {str(e)}")
193
  return None
194
 
195
- def _execute_generation(self, client, provider: str, model: str, prompt: Any) -> Dict[str, Any]: # Use Any for prompt type
196
-
197
  """Execute provider-specific generation with circuit breaker"""
198
  st.session_state.system_metrics["api_calls"] += 1
199
 
200
- if provider == "HuggingFace":
201
- response = requests.post(
202
- self.PROVIDER_CONFIG[provider]["base_url"] + model,
203
- headers=client["headers"],
204
- json={"inputs": prompt},
205
- timeout=30
206
- )
207
- response.raise_for_status()
208
- return response.json()
209
- elif provider == "Google":
210
- try:
211
- if isinstance(prompt, list): #Multimodal case
212
-
213
- response = client.generate_content(prompt)
214
 
 
 
215
  else:
 
216
 
217
- response = client.generate_content(prompt)
218
-
219
 
220
  content = response.text
221
-
222
- if st.session_state.generation_format == "json":
223
  try:
224
  return json.loads(content)
225
  except json.JSONDecodeError:
226
  return {"content": content,
227
  "warning": "Could not parse response as valid JSON. Returning raw text."}
228
  else:
229
- return {"content": content}
230
 
231
- except Exception as e:
232
- self._log_error(f"Google Generation Error: {str(e)}")
233
- return {"error": str(e), "content": ""}
234
- else:
235
- completion = client.chat.completions.create(
236
- model=model,
237
- messages=[{"role": "user", "content": prompt}],
238
- temperature=st.session_state.advanced_options["temperature"],
239
- max_tokens=st.session_state.advanced_options["max_output_tokens"]
240
- )
241
- st.session_state.system_metrics["tokens_used"] += completion.usage.total_tokens
242
- try:
243
- return json.loads(completion.choices[0].message.content)
244
- except json.JSONDecodeError:
245
- return {"content": completion.choices[0].message.content,
246
- "warning": "Could not parse response as valid JSON. Returning raw text."}
 
 
 
 
 
 
 
 
 
247
 
248
  def _failover_generation(self, prompt: str) -> Dict[str, Any]:
249
  """Enterprise failover to secondary providers"""
@@ -271,19 +295,20 @@ class SyntheticDataGenerator:
271
  response = requests.get(url, headers={
272
  "User-Agent": "Mozilla/5.0 (compatible; SyntheticBot/1.0)"
273
  }, timeout=10)
 
274
  return response.text
275
- except Exception as e:
276
  self._log_error(f"Web Extraction Error: {str(e)}")
277
  return ""
 
 
 
278
 
279
  def _process_csv(self, file) -> str:
280
  """Process CSV files and return as a string representation."""
281
  try:
282
  df = pd.read_csv(file)
283
- column_names = df.columns.tolist()
284
- data_types = [str(df[col].dtype) for col in df.columns]
285
- schema_prompt = f"Column Names: {column_names}\nData Types: {data_types}"
286
- st.session_state.csv_schema = schema_prompt
287
  return df.to_string()
288
  except Exception as e:
289
  self._log_error(f"CSV Processing Error: {str(e)}")
@@ -293,17 +318,40 @@ class SyntheticDataGenerator:
293
  """Simple text passthrough processor"""
294
  return text
295
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
296
  def _process_api(self, url: str, method="GET", headers: Optional[Dict[str, str]] = None,
297
  data: Optional[Dict[str, Any]] = None) -> str:
298
  """Generic API endpoint processor with configurable methods and headers."""
299
  try:
300
  if method.upper() == "GET":
301
- response = requests.get(url, headers=headers or {}, timeout=10)
 
302
  elif method.upper() == "POST":
303
- response = requests.post(url, headers=headers or {}, json=data, timeout=10)
 
304
  else:
305
  raise ValueError("Unsupported HTTP method.")
306
- response.raise_for_status()
307
 
308
  try:
309
  return json.dumps(response.json(), indent=2)
@@ -312,6 +360,9 @@ class SyntheticDataGenerator:
312
  except requests.exceptions.RequestException as e:
313
  self._log_error(f"API Processing Error: {str(e)}")
314
  return ""
 
 
 
315
 
316
  def _process_database(self, connection_string: str, query: str) -> str:
317
  """Database query processor using SQLAlchemy."""
@@ -325,17 +376,6 @@ class SyntheticDataGenerator:
325
  self._log_error(f"Database Processing Error: {str(e)}")
326
  return ""
327
 
328
- def _process_image(self, image_file) -> list: #Returns a list
329
- """Processes image files for multimodal generation (Google Gemini)"""
330
- try:
331
- image_data = image_file.read()
332
- image_part = Part.from_data(image_data, mime_type=image_file.type) #Use Part for google
333
- return [image_part] #Return a list with the image part as a Google Part object
334
-
335
- except Exception as e:
336
- self._log_error(f"Image Processing Error: {str(e)}")
337
- return []
338
-
339
  # --- Enterprise Features ---
340
  def _log_error(self, message: str) -> None:
341
  """Centralized error logging with telemetry"""
@@ -353,7 +393,7 @@ class SyntheticDataGenerator:
353
  provider: self._test_provider_connectivity(provider)
354
  for provider in self.available_providers
355
  },
356
- "system_metrics": st.session_state.system_metrics
357
  }
358
 
359
  def _test_provider_connectivity(self, provider: str) -> bool:
@@ -369,24 +409,26 @@ class SyntheticDataGenerator:
369
  return response.status_code == 200
370
  elif provider == "Google":
371
  try:
372
- if not st.session_state.google_configured: #Check if google has been configured
373
 
374
- api_key = st.session_state.api_keys.get("Google", "") #Get Key from session state
 
375
 
376
- if not api_key: #If that is not set, check environment variable.
377
  api_key = os.environ.get("GOOGLE_API_KEY")
378
 
379
  if not api_key:
380
- return False #Cant test API if no API Key
381
 
382
- configure(api_key=api_key) #Configure API Key
383
  st.session_state.google_configured = True
384
- #st.write("configuring key")
385
 
386
- genai.GenerativeModel(model_name=self.PROVIDER_CONFIG["Google"]["models"][0]).generate_content("test") #Test a generation
 
387
  return True
388
 
389
- except Exception as e: #Catch any exceptions
390
  print(e)
391
  return False
392
 
@@ -406,7 +448,7 @@ def provider_config_ui(gen: SyntheticDataGenerator):
406
  provider = st.selectbox(
407
  "AI Provider",
408
  gen.available_providers,
409
- help="Available providers based on system configuration"
410
  )
411
  st.session_state.active_provider = provider
412
 
@@ -415,7 +457,7 @@ def provider_config_ui(gen: SyntheticDataGenerator):
415
  f"{provider} API Key",
416
  type="password",
417
  value=st.session_state.api_keys.get(provider, ""),
418
- help=f"Obtain API key from {provider} portal"
419
  )
420
  st.session_state.api_keys[provider] = api_key
421
 
@@ -423,50 +465,49 @@ def provider_config_ui(gen: SyntheticDataGenerator):
423
  model = st.selectbox(
424
  "Model",
425
  gen.PROVIDER_CONFIG[provider]["models"],
426
- help="Select model version based on your API plan"
427
  )
428
  st.session_state.active_model = model
429
 
430
- # Advanced Options
431
- if provider == "Google" or provider == "OpenAI":
432
- st.subheader("Advanced Generation Options")
433
- st.session_state.advanced_options["temperature"] = st.slider("Temperature", min_value=0.0,
434
- max_value=1.0,
435
- value=st.session_state.advanced_options[
436
- "temperature"], step=0.05,
437
- help="Controls randomness. Lower values = more deterministic.")
438
- if provider == "Google":
439
- st.session_state.advanced_options["top_p"] = st.slider("Top P", min_value=0.0, max_value=1.0,
440
- value=st.session_state.advanced_options["top_p"],
441
- step=0.05,
442
- help="Nucleus sampling: Considers the most probable tokens.")
443
- st.session_state.advanced_options["top_k"] = st.slider("Top K", min_value=1, max_value=100,
444
- value=st.session_state.advanced_options["top_k"],
445
- step=1,
446
- help="Considers the top K most probable tokens.")
447
-
448
- st.session_state.advanced_options["max_output_tokens"] = st.number_input("Max Output Tokens",
449
- min_value=50, max_value=4096,
450
- value=st.session_state.advanced_options[
451
- "max_output_tokens"], step=50,
452
- help="Maximum number of tokens in the generated output.")
453
-
454
- st.session_state.generation_format = st.selectbox("Output Format", ["json", "text"],
455
- help="Choose the desired output format.")
456
 
457
  # System monitoring
458
  if st.button("Run Health Check"):
459
  report = gen.health_check()
460
  st.json(report)
461
 
462
-
463
  def input_ui():
464
  """Creates the input method UI"""
465
  input_method = st.selectbox("Input Method",
466
- ["Text", "PDF", "Web URL", "CSV", "Image",
467
- "Structured Prompt (Advanced)"]) # Add Image input, Add Structured Prompt (Advanced)
 
468
  input_content = None
469
- additional_instructions = "" # For structured prompt
470
 
471
  if input_method == "Text":
472
  input_content = st.text_area("Enter Text", height=200)
@@ -481,22 +522,16 @@ def input_ui():
481
  uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
482
  if uploaded_file is not None:
483
  input_content = uploaded_file
484
- if "csv_schema" in st.session_state:
485
- st.write("Inferred CSV Schema:")
486
- st.write(st.session_state.csv_schema)
487
-
488
  elif input_method == "Image":
489
  uploaded_file = st.file_uploader("Upload an Image file", type=["png", "jpg", "jpeg"])
490
  if uploaded_file is not None:
491
  input_content = uploaded_file
492
 
493
- elif input_method == "Structured Prompt (Advanced)":
494
- st.subheader("Structured Prompt")
495
- input_content = st.text_area("Enter the base prompt/instructions", height=100)
496
- additional_instructions = st.text_area("Specify constraints, data format, or other requirements:",
497
- height=100)
498
-
499
- return input_method, input_content, additional_instructions
500
 
501
 
502
  def main():
@@ -504,56 +539,62 @@ def main():
504
  st.set_page_config(
505
  page_title="Synthetic Data Factory Pro",
506
  page_icon="🏭",
507
- layout="wide"
508
  )
509
 
510
  gen = SyntheticDataGenerator()
511
 
512
  st.title("🏭 Synthetic Data Factory Pro")
513
- st.markdown("""
 
514
  **World's Most Advanced Synthetic Data Generation Platform**
515
  *Multi-provider AI Engine | Enterprise Input Processors | Real-time Monitoring*
516
- """)
 
517
 
518
  provider_config_ui(gen)
519
 
520
- input_method, input_content, additional_instructions = input_ui()
521
 
522
  if st.button("Generate Data"):
523
- if input_content or input_method == "Structured Prompt (Advanced)":
524
- processed_input = None
525
-
526
- if input_method == "Text":
527
- processed_input = gen._process_text(input_content)
528
- elif input_method == "PDF":
529
- processed_input = gen._process_pdf(input_content)
530
- elif input_method == "Web URL":
531
- processed_input = gen._process_web(input_content)
532
- elif input_method == "CSV":
533
- processed_input = gen._process_csv(input_content)
534
- elif input_method == "Image":
535
- processed_input = gen._process_image(input_content) #This is a list now
536
- if not processed_input: #If something went wrong with image processing, don't proceed
537
- st.error("Error processing image.")
538
- return
539
-
540
- elif input_method == "Structured Prompt (Advanced)":
541
- processed_input = input_content + "\n" + additional_instructions
542
-
543
- if processed_input:
544
- try:
545
- if st.session_state.active_provider == "Google" and input_method == "Image":
546
- prompt_parts = [input_content] + processed_input #Keeps text and images separate for google
547
- result = gen.generate(st.session_state.active_provider, st.session_state.active_model, prompt_parts)
548
- else:
549
- result = gen.generate(st.session_state.active_provider, st.session_state.active_model, processed_input)
550
 
 
 
 
 
 
 
 
 
 
 
 
551
  st.subheader("Generated Output:")
552
  st.json(result)
553
- except Exception as e:
554
- st.error(f"Error during generation: {e}")
555
- else:
556
- st.warning("No data to process. Please check your input.")
557
  else:
558
  st.warning("Please provide input data.")
559
 
 
3
  import pdfplumber
4
  import pandas as pd
5
  import sqlalchemy
6
+ from typing import Any, Dict, List, Optional, Union
7
  from functools import lru_cache
8
+ import json # Explicit import
9
  import os
10
 
11
+ # --- Constants ---
12
+ DEFAULT_TEMPERATURE = 0.1
13
+ DEFAULT_MAX_TOKENS = 2000
14
+ API_TIMEOUT = 30
15
+
16
  # Provider clients with import guards
17
  try:
18
  from openai import OpenAI
 
33
  genai = None
34
  Part = None
35
 
 
36
 
37
  class SyntheticDataGenerator:
38
  """World's Most Advanced Synthetic Data Generation System"""
 
41
  "Deepseek": {
42
  "base_url": "https://api.deepseek.com/v1",
43
  "models": ["deepseek-chat"],
44
+ "requires_library": "openai",
45
+ "supports_json_output": True, # Indicate that the provider reliably returns JSON
46
  },
47
  "OpenAI": {
48
  "base_url": "https://api.openai.com/v1",
49
  "models": ["gpt-4-turbo", "gpt-3.5-turbo"],
50
+ "requires_library": "openai",
51
+ "supports_json_output": True,
52
  },
53
  "Groq": {
54
  "base_url": "https://api.groq.com/openai/v1",
55
  "models": ["mixtral-8x7b-32768", "llama2-70b-4096"],
56
+ "requires_library": "groq",
57
+ "supports_json_output": True,
58
  },
59
  "HuggingFace": {
60
  "base_url": "https://api-inference.huggingface.co/models/",
61
  "models": ["gpt2", "llama-2-13b-chat"],
62
+ "requires_library": None,
63
+ "supports_json_output": False, # More likely to return text
64
  },
65
  "Google": {
66
  "models": ["gemini-1.5-flash-latest", "gemini-1.5-pro-latest", "gemini-pro", "gemini-pro-vision"],
67
+ "requires_library": "google.generativeai",
68
+ "supports_json_output": True
69
+ },
70
  }
71
 
72
  def __init__(self):
 
84
  "system_metrics": {
85
  "api_calls": 0,
86
  "tokens_used": 0,
87
+ "error_count": 0,
88
  },
89
  "debug_mode": False,
90
+ "temperature": DEFAULT_TEMPERATURE, # Add temperature control
91
+ "max_tokens": DEFAULT_MAX_TOKENS, # Add max token control
92
+ "use_streaming": False, # Control Streaming behavior
93
+ "prompt_template": None, # Support prompt templates
94
+ "api_call_timeout": API_TIMEOUT, # API call timeout
95
+ "image_parts": [], # Store image parts for multimodal generation
96
+ "top_p": 0.95, # Default top_p for Google
97
+ "top_k": 40, # Default top_k for Google
98
+ "safety_settings": self._get_default_safety_settings(), #Default Safety Settings
99
  }
100
  for key, val in defaults.items():
101
  if key not in st.session_state:
102
  st.session_state[key] = val
103
 
104
def _get_default_safety_settings(self):
    """Return the default safety-setting list used for the Google provider.

    Every call builds a fresh list of fresh dicts. Each entry pairs one harm
    category with the same "BLOCK_MEDIUM_AND_ABOVE" threshold.
    """
    harm_categories = (
        "HARM_CATEGORY_HARASSMENT",
        "HARM_CATEGORY_HATE_SPEECH",
        "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "HARM_CATEGORY_DANGEROUS_CONTENT",
    )
    return [
        {"category": name, "threshold": "BLOCK_MEDIUM_AND_ABOVE"}
        for name in harm_categories
    ]
124
+
125
  def _setup_providers(self):
126
  """Configure available providers with health checks"""
127
  self.available_providers = []
128
  for provider, config in self.PROVIDER_CONFIG.items():
129
  if config["requires_library"] and not globals().get(config["requires_library"].split('.')[0].title()):
130
+ continue # Skip providers with missing dependencies
131
  self.available_providers.append(provider)
132
 
133
  def _setup_input_handlers(self):
 
139
  "api": self._process_api,
140
  "database": self._process_database,
141
  "web": self._process_web,
142
+ "prompt_template": self._process_prompt_template,
143
+ "image": self._process_image,
144
  }
145
 
146
  # --- Core Generation Engine ---
147
  @lru_cache(maxsize=100)
148
+ def generate(self, provider: str, model: str, prompt: Any) -> Dict[str, Any]:
149
  """Unified generation endpoint with failover support"""
150
  try:
151
  if provider not in self.available_providers:
 
167
  api_key = st.session_state.api_keys.get(provider, "")
168
 
169
  if not api_key and provider != "Google":
170
+ raise ValueError(f"API key required for provider: {provider}")
171
 
172
  try:
173
  if provider == "Groq":
 
190
  raise ValueError(f"Error configuring Google API: {e}")
191
 
192
  generation_config = genai.GenerationConfig(
193
+ temperature=st.session_state["temperature"],
194
+ top_p=st.session_state["top_p"],
195
+ top_k=st.session_state["top_k"],
196
+ max_output_tokens=st.session_state["max_tokens"],
197
  )
198
+ safety_settings = st.session_state["safety_settings"] #Get Safety Settings
199
+
200
+ return GenerativeModel(model_name=model, generation_config=generation_config,
201
+ safety_settings=safety_settings) # Use all settings
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
202
  else:
203
  return OpenAI(
204
  base_url=config["base_url"],
205
  api_key=api_key,
206
+ timeout=st.session_state["api_call_timeout"], # Use session state timeout
207
  )
208
  except Exception as e:
209
  self._log_error(f"Client Init Failed: {str(e)}")
210
  return None
211
 
212
def _execute_generation(self, client, provider: str, model: str, prompt: Any) -> Dict[str, Any]:
    """Send one generation request to ``provider`` and normalise the reply.

    Args:
        client: Provider client. Indexed as ``client["headers"]`` for
            HuggingFace, called as ``client.generate_content(...)`` for
            Google, and as ``client.chat.completions.create(...)`` for
            every other provider.
        provider: Key into ``PROVIDER_CONFIG``.
        model: Model identifier forwarded to the provider.
        prompt: Prompt text; for Google a non-string value is forwarded
            unchanged as the list of parts.

    Returns:
        For HuggingFace, the decoded JSON body of the HTTP response.
        For other providers, the parsed JSON when the provider is flagged
        ``supports_json_output`` and the text parses; otherwise
        ``{"content": text}`` (plus a ``"warning"`` key when parsing was
        attempted and failed). On any exception the error is logged and
        ``{"error": message, "content": ""}`` is returned.
    """
    st.session_state.system_metrics["api_calls"] += 1

    def as_result(text):
        # Only attempt JSON parsing for providers flagged as JSON-capable.
        if not self.PROVIDER_CONFIG[provider]["supports_json_output"]:
            return {"content": text}
        try:
            return json.loads(text)
        except json.JSONDecodeError:
            return {"content": text,
                    "warning": "Could not parse response as valid JSON. Returning raw text."}

    try:
        if provider == "HuggingFace":
            response = requests.post(
                self.PROVIDER_CONFIG[provider]["base_url"] + model,
                headers=client["headers"],
                json={"inputs": prompt},
                timeout=st.session_state["api_call_timeout"],
            )
            response.raise_for_status()  # Raise HTTPError for bad responses
            return response.json()

        if provider == "Google":
            # A plain string is wrapped; anything else is assumed to already
            # be a list of parts (multimodal prompt).
            parts = [prompt] if isinstance(prompt, str) else prompt
            response = client.generate_content(parts)
            return as_result(response.text)

        streaming = st.session_state["use_streaming"]
        completion = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            temperature=st.session_state["temperature"],
            max_tokens=st.session_state["max_tokens"],
            stream=streaming,
        )

        if streaming:
            # A streamed reply is an iterator of chunks, not a completion
            # object: the text arrives as per-chunk deltas and has to be
            # joined here. Usage is only present if the server sends it.
            usage = None
            pieces = []
            for chunk in completion:
                usage = getattr(chunk, "usage", None) or usage
                if chunk.choices and chunk.choices[0].delta.content:
                    pieces.append(chunk.choices[0].delta.content)
            content = "".join(pieces)
        else:
            usage = completion.usage
            content = completion.choices[0].message.content

        # Some OpenAI-compatible backends omit usage; skip accounting then.
        if usage is not None:
            st.session_state.system_metrics["tokens_used"] += usage.total_tokens

        return as_result(content)
    except requests.exceptions.RequestException as e:
        self._log_error(f"API Request Error: {str(e)}")
        return {"error": str(e), "content": ""}
    except Exception as e:
        self._log_error(f"Generation Error: {str(e)}")
        return {"error": str(e), "content": ""}
271
 
272
  def _failover_generation(self, prompt: str) -> Dict[str, Any]:
273
  """Enterprise failover to secondary providers"""
 
295
  response = requests.get(url, headers={
296
  "User-Agent": "Mozilla/5.0 (compatible; SyntheticBot/1.0)"
297
  }, timeout=10)
298
+ response.raise_for_status() # Raises HTTPError for bad responses (4xx or 5xx)
299
  return response.text
300
+ except requests.exceptions.RequestException as e:
301
  self._log_error(f"Web Extraction Error: {str(e)}")
302
  return ""
303
+ except Exception as e:
304
+ self._log_error(f"Unexpected Web Extraction Error: {str(e)}")
305
+ return ""
306
 
307
  def _process_csv(self, file) -> str:
308
  """Process CSV files and return as a string representation."""
309
  try:
310
  df = pd.read_csv(file)
311
+ # Add more sophisticated CSV processing here, e.g., schema inference
 
 
 
312
  return df.to_string()
313
  except Exception as e:
314
  self._log_error(f"CSV Processing Error: {str(e)}")
 
318
  """Simple text passthrough processor"""
319
  return text
320
 
321
def _process_prompt_template(self, file) -> str:
    """Read an uploaded template and keep its text in session state.

    The file's bytes are decoded as UTF-8 and stored under
    ``st.session_state["prompt_template"]``.

    Returns:
        A fixed confirmation message on success, or "" after logging the
        error when reading, decoding or storing fails.
    """
    try:
        raw_bytes = file.read()
        st.session_state["prompt_template"] = raw_bytes.decode("utf-8")
    except Exception as exc:
        self._log_error(f"Prompt Template Processing Error: {str(exc)}")
        return ""
    return "Prompt template uploaded and stored."
330
+
331
def _process_image(self, image_file) -> list:
    """Wrap an uploaded image as a one-element list of ``Part`` objects.

    The file's bytes and its ``type`` attribute (passed as the MIME type)
    are handed to ``Part.from_data``.

    Returns:
        ``[part]`` on success, or ``[]`` after logging the error when the
        image cannot be read or converted.
    """
    parts = []
    try:
        raw = image_file.read()
        parts.append(Part.from_data(raw, mime_type=image_file.type))
    except Exception as err:
        self._log_error(f"Image Processing Error: {str(err)}")
        return []
    return parts
341
+
342
  def _process_api(self, url: str, method="GET", headers: Optional[Dict[str, str]] = None,
343
  data: Optional[Dict[str, Any]] = None) -> str:
344
  """Generic API endpoint processor with configurable methods and headers."""
345
  try:
346
  if method.upper() == "GET":
347
+ response = requests.get(url, headers=headers or {},
348
+ timeout=st.session_state["api_call_timeout"])
349
  elif method.upper() == "POST":
350
+ response = requests.post(url, headers=headers or {}, json=data,
351
+ timeout=st.session_state["api_call_timeout"])
352
  else:
353
  raise ValueError("Unsupported HTTP method.")
354
+ response.raise_for_status() # Raise HTTPError for bad responses
355
 
356
  try:
357
  return json.dumps(response.json(), indent=2)
 
360
  except requests.exceptions.RequestException as e:
361
  self._log_error(f"API Processing Error: {str(e)}")
362
  return ""
363
+ except Exception as e:
364
+ self._log_error(f"Unexpected API Processing Error: {str(e)}")
365
+ return ""
366
 
367
  def _process_database(self, connection_string: str, query: str) -> str:
368
  """Database query processor using SQLAlchemy."""
 
376
  self._log_error(f"Database Processing Error: {str(e)}")
377
  return ""
378
 
 
 
 
 
 
 
 
 
 
 
 
379
  # --- Enterprise Features ---
380
  def _log_error(self, message: str) -> None:
381
  """Centralized error logging with telemetry"""
 
393
  provider: self._test_provider_connectivity(provider)
394
  for provider in self.available_providers
395
  },
396
+ "system_metrics": st.session_state.system_metrics,
397
  }
398
 
399
  def _test_provider_connectivity(self, provider: str) -> bool:
 
409
  return response.status_code == 200
410
  elif provider == "Google":
411
  try:
412
+ if not st.session_state.google_configured: # Check if google has been configured
413
 
414
+ api_key = st.session_state.api_keys.get("Google",
415
+ "") # Get Key from session state
416
 
417
+ if not api_key: # If that is not set, check environment variable.
418
  api_key = os.environ.get("GOOGLE_API_KEY")
419
 
420
  if not api_key:
421
+ return False # Cant test API if no API Key
422
 
423
+ configure(api_key=api_key) # Configure API Key
424
  st.session_state.google_configured = True
425
+ # st.write("configuring key")
426
 
427
+ genai.GenerativeModel(model_name=self.PROVIDER_CONFIG["Google"]["models"][0]).generate_content(
428
+ "test") # Test a generation
429
  return True
430
 
431
+ except Exception as e: # Catch any exceptions
432
  print(e)
433
  return False
434
 
 
448
  provider = st.selectbox(
449
  "AI Provider",
450
  gen.available_providers,
451
+ help="Available providers based on system configuration",
452
  )
453
  st.session_state.active_provider = provider
454
 
 
457
  f"{provider} API Key",
458
  type="password",
459
  value=st.session_state.api_keys.get(provider, ""),
460
+ help=f"Obtain API key from {provider} portal",
461
  )
462
  st.session_state.api_keys[provider] = api_key
463
 
 
465
  model = st.selectbox(
466
  "Model",
467
  gen.PROVIDER_CONFIG[provider]["models"],
468
+ help="Select model version based on your API plan",
469
  )
470
  st.session_state.active_model = model
471
 
472
+ # Advanced options
473
+ st.subheader("Advanced Options")
474
+ st.session_state["temperature"] = st.slider("Temperature", 0.0, 1.0, DEFAULT_TEMPERATURE, 0.05)
475
+ st.session_state["max_tokens"] = st.number_input("Max Tokens", 50, 4000, DEFAULT_MAX_TOKENS, 50)
476
+ st.session_state["use_streaming"] = st.checkbox("Enable Streaming")
477
+ st.session_state["api_call_timeout"] = st.slider("API Call Timeout (seconds)", 5, 60, API_TIMEOUT, 5)
478
+
479
+ # Google Specific Options
480
+ if provider == "Google":
481
+ st.subheader("Google Specific Settings")
482
+ st.session_state["top_p"] = st.slider("Top P", 0.0, 1.0, 0.95, 0.05, help="Nucleus sampling: Considers the most probable tokens.")
483
+ st.session_state["top_k"] = st.slider("Top K", 1, 100, 40, 1, help="Considers the top K most probable tokens.")
484
+
485
+ # Safety Settings Configuration
486
+ st.subheader("Safety Settings")
487
+ safety_categories = ["HARM_CATEGORY_HARASSMENT", "HARM_CATEGORY_HATE_SPEECH", "HARM_CATEGORY_SEXUALLY_EXPLICIT", "HARM_CATEGORY_DANGEROUS_CONTENT"]
488
+ threshold_options = ["BLOCK_NONE", "BLOCK_LOW_AND_ABOVE", "BLOCK_MEDIUM_AND_ABOVE", "BLOCK_ONLY_HIGH",]
489
+
490
+ for category in safety_categories:
491
+ threshold = st.selectbox(f"Threshold for {category}", options=threshold_options, index=2, key=f"{category}_threshold") # Start with Medium and Above
492
+ #Update Threshold
493
+ for setting in st.session_state["safety_settings"]:
494
+ if setting["category"] == category:
495
+ setting["threshold"] = threshold
496
+ break
497
+
498
 
499
  # System monitoring
500
  if st.button("Run Health Check"):
501
  report = gen.health_check()
502
  st.json(report)
503
 
 
504
  def input_ui():
505
  """Creates the input method UI"""
506
  input_method = st.selectbox("Input Method",
507
+ ["Text", "PDF", "Web URL", "CSV", "Prompt Template",
508
+ "Image"]) # Add Image input, Add Structured Prompt (Advanced)
509
+
510
  input_content = None
 
511
 
512
  if input_method == "Text":
513
  input_content = st.text_area("Enter Text", height=200)
 
522
  uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
523
  if uploaded_file is not None:
524
  input_content = uploaded_file
525
+ elif input_method == "Prompt Template":
526
+ uploaded_file = st.file_uploader("Upload a Prompt Template file", type=["txt", "j2"])
527
+ if uploaded_file is not None:
528
+ input_content = uploaded_file
529
  elif input_method == "Image":
530
  uploaded_file = st.file_uploader("Upload an Image file", type=["png", "jpg", "jpeg"])
531
  if uploaded_file is not None:
532
  input_content = uploaded_file
533
 
534
+ return input_method, input_content
 
 
 
 
 
 
535
 
536
 
537
  def main():
 
539
  st.set_page_config(
540
  page_title="Synthetic Data Factory Pro",
541
  page_icon="🏭",
542
+ layout="wide",
543
  )
544
 
545
  gen = SyntheticDataGenerator()
546
 
547
  st.title("🏭 Synthetic Data Factory Pro")
548
+ st.markdown(
549
+ """
550
  **World's Most Advanced Synthetic Data Generation Platform**
551
  *Multi-provider AI Engine | Enterprise Input Processors | Real-time Monitoring*
552
+ """
553
+ )
554
 
555
  provider_config_ui(gen)
556
 
557
+ input_method, input_content = input_ui()
558
 
559
  if st.button("Generate Data"):
560
+ if input_content:
561
+ try:
562
+ if input_method == "Text":
563
+ processed_input = gen._process_text(input_content)
564
+ elif input_method == "PDF":
565
+ processed_input = gen._process_pdf(input_content)
566
+ elif input_method == "Web URL":
567
+ processed_input = gen._process_web(input_content)
568
+ elif input_method == "CSV":
569
+ processed_input = gen._process_csv(input_content)
570
+ elif input_method == "Prompt Template":
571
+ processed_input = gen._process_prompt_template(
572
+ input_content) # Process the uploaded template
573
+ elif input_method == "Image":
574
+ processed_input = gen._process_image(input_content) # Returns a List of Parts
575
+
576
+ # If a prompt template is loaded, use it.
577
+ if st.session_state["prompt_template"] is not None and input_method != "Prompt Template":
578
+ try:
579
+ from jinja2 import Template # Conditionally import it.
 
 
 
 
 
 
 
580
 
581
+ template = Template(st.session_state["prompt_template"]) # Load Jinja2 Template
582
+ processed_input = template.render(
583
+ input=processed_input) # Render the template - Overwrites the Input, Google needs parts, not text
584
+
585
+ except Exception as e:
586
+ st.error(f"Error rendering prompt template: {e}")
587
+ st.stop() # Stop the app if template rendering fails
588
+
589
+ if processed_input:
590
+ result = gen.generate(st.session_state.active_provider, st.session_state.active_model,
591
+ processed_input)
592
  st.subheader("Generated Output:")
593
  st.json(result)
594
+ else:
595
+ st.warning("No data to process. Please check your input.")
596
+ except Exception as e:
597
+ st.error(f"An unexpected error occurred: {e}")
598
  else:
599
  st.warning("Please provide input data.")
600