Spaces:

mgbam
/

sythenticdata

Sleeping

App Files Files Community

mgbam committed on Feb 9

Commit

8fa07b2

verified ·

1 Parent(s): a4956fa

Update app.py

Browse files

Files changed (1) hide show

app.py +109 -89

app.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import json
 import ast
 import requests
 import streamlit as st
 import pdfplumber
@@ -7,6 +8,16 @@ import pandas as pd
 import sqlalchemy
 from typing import Any, Dict, List, Callable
 # Provider clients – ensure these libraries are installed
 try:
     from openai import OpenAI
@@ -18,7 +29,7 @@ try:
 except ImportError:
     groq = None
-# Hugging Face inference endpoint and defaults
 HF_API_URL: str = "https://api-inference.huggingface.co/models/"
 DEFAULT_TEMPERATURE: float = 0.1
 GROQ_MODEL: str = "mixtral-8x7b-32768"
@@ -26,28 +37,27 @@ GROQ_MODEL: str = "mixtral-8x7b-32768"
 class SyntheticDataGenerator:
     """
-    An advanced Synthetic Data Generator for creating training examples for fine-tuning.
-    The generator accepts various input sources and then uses an LLM provider to create
-    synthetic examples in JSON format. Each example is a dictionary with 'input' and 'output' keys.
     """
     def __init__(self) -> None:
         self._setup_providers()
         self._setup_input_handlers()
         self._initialize_session_state()
-        # Prompt template with dynamic {num_examples} parameter and escaped curly braces.
         self.custom_prompt_template: str = (
             "You are an expert in generating synthetic training data for fine-tuning. "
             "Generate {num_examples} training examples from the following data, formatted as a JSON list of dictionaries. "
             "Each dictionary must have keys 'input' and 'output'. "
-            "The examples should be clear, diverse, and based solely on the provided data. "
-            "Do not add any external information. \n\n"
             "Example JSON Output:\n"
             "[{{'input': 'sample input text 1', 'output': 'sample output text 1'}}, "
             "{{'input': 'sample input text 2', 'output': 'sample output text 2'}}]\n\n"
             "Now, generate {num_examples} training examples from this data:\n{data}"
         )
     def _setup_providers(self) -> None:
         """Configure available LLM providers and their client initialization routines."""
         self.providers: Dict[str, Dict[str, Any]] = {
@@ -68,9 +78,9 @@ class SyntheticDataGenerator:
                 "models": ["gpt2", "llama-2"],
             },
         }
     def _setup_input_handlers(self) -> None:
-        """Register handlers for different input data types."""
         self.input_handlers: Dict[str, Callable[[Any], Dict[str, Any]]] = {
             "text": self.handle_text,
             "pdf": self.handle_pdf,
@@ -78,20 +88,23 @@ class SyntheticDataGenerator:
             "api": self.handle_api,
             "db": self.handle_db,
         }
     def _initialize_session_state(self) -> None:
-        """Initialize Streamlit session state with default configuration."""
         defaults: Dict[str, Any] = {
             "config": {
                 "provider": "OpenAI",
                 "model": "gpt-4-turbo",
                 "temperature": DEFAULT_TEMPERATURE,
-                "num_examples": 3,  # Default number of synthetic examples
             },
             "api_key": "",
-            "inputs": [],         # List to store input sources
-            "synthetic_data": None,  # Generated synthetic data output
-            "error_logs": [],     # To store error messages
         }
         for key, value in defaults.items():
             if key not in st.session_state:
@@ -113,18 +126,19 @@ class SyntheticDataGenerator:
                 st.session_state.config["num_examples"] = int(params["num_examples"][0])
             except ValueError:
                 pass
     def log_error(self, message: str) -> None:
-        """Log an error message to session state and display it."""
         st.session_state.error_logs.append(message)
         st.error(message)
     # ----- Input Handlers -----
     def handle_text(self, text: str) -> Dict[str, Any]:
-        """Process plain text input."""
         return {"data": text, "source": "text"}
-    def handle_pdf(self, file) -> Dict[str, Any]:
         """Extract text from a PDF file."""
         try:
             with pdfplumber.open(file) as pdf:
@@ -133,17 +147,16 @@ class SyntheticDataGenerator:
         except Exception as e:
             self.log_error(f"PDF Processing Error: {e}")
             return {"data": "", "source": "pdf"}
-    def handle_csv(self, file) -> Dict[str, Any]:
-        """Process a CSV file by converting it to JSON."""
         try:
             df = pd.read_csv(file)
-            json_data = df.to_json(orient="records")
-            return {"data": json_data, "source": "csv"}
         except Exception as e:
             self.log_error(f"CSV Processing Error: {e}")
             return {"data": "", "source": "csv"}
     def handle_api(self, config: Dict[str, str]) -> Dict[str, Any]:
         """Fetch data from an API endpoint."""
         try:
@@ -153,9 +166,9 @@ class SyntheticDataGenerator:
         except Exception as e:
             self.log_error(f"API Processing Error: {e}")
             return {"data": "", "source": "api"}
     def handle_db(self, config: Dict[str, str]) -> Dict[str, Any]:
-        """Query a database using the provided connection string and SQL query."""
         try:
             engine = sqlalchemy.create_engine(config["connection"])
             with engine.connect() as conn:
@@ -165,19 +178,18 @@ class SyntheticDataGenerator:
         except Exception as e:
             self.log_error(f"Database Processing Error: {e}")
             return {"data": "", "source": "db"}
     def aggregate_inputs(self) -> str:
-        """Combine all input sources into a single aggregated string."""
-        aggregated_data = ""
         for item in st.session_state.inputs:
-            aggregated_data += f"Source: {item.get('source', 'unknown')}\n"
-            aggregated_data += item.get("data", "") + "\n\n"
-        return aggregated_data.strip()
     def build_prompt(self) -> str:
         """
-        Build the complete prompt using the custom template, aggregated inputs,
-        and the number of examples.
         """
         data = self.aggregate_inputs()
         num_examples = st.session_state.config.get("num_examples", 3)
@@ -185,50 +197,52 @@ class SyntheticDataGenerator:
         st.write("### Built Prompt")
         st.write(prompt)
         return prompt
     def generate_synthetic_data(self) -> bool:
         """
-        Generate synthetic training examples by sending the built prompt to the selected LLM provider.
         """
         api_key: str = st.session_state.api_key
         if not api_key:
             self.log_error("API key is missing!")
             return False
         provider_name: str = st.session_state.config["provider"]
         provider_cfg: Dict[str, Any] = self.providers.get(provider_name, {})
         if not provider_cfg:
             self.log_error(f"Provider {provider_name} is not configured.")
             return False
         client_initializer: Callable[[str], Any] = provider_cfg["client"]
         client = client_initializer(api_key)
         model: str = st.session_state.config["model"]
         temperature: float = st.session_state.config["temperature"]
         prompt: str = self.build_prompt()
         st.info(f"Using **{provider_name}** with model **{model}** at temperature **{temperature:.2f}**")
         try:
             if provider_name == "HuggingFace":
                 response = self._huggingface_inference(client, prompt, model)
             else:
                 response = self._standard_inference(client, prompt, model, temperature)
             st.write("### Raw API Response")
             st.write(response)
             synthetic_examples = self._parse_response(response, provider_name)
             st.write("### Parsed Synthetic Data")
             st.write(synthetic_examples)
             st.session_state.synthetic_data = synthetic_examples
             return True
         except Exception as e:
             self.log_error(f"Generation failed: {e}")
             return False
     def _standard_inference(self, client: Any, prompt: str, model: str, temperature: float) -> Any:
-        """Inference method for providers using an OpenAI-compatible API."""
         try:
             st.write("Sending prompt via standard inference...")
             result = client.chat.completions.create(
@@ -241,9 +255,11 @@ class SyntheticDataGenerator:
         except Exception as e:
             self.log_error(f"Standard Inference Error: {e}")
             return None
     def _huggingface_inference(self, client: Dict[str, Any], prompt: str, model: str) -> Any:
-        """Inference method for the Hugging Face Inference API."""
         try:
             st.write("Sending prompt to HuggingFace API...")
             response = requests.post(
@@ -258,38 +274,39 @@ class SyntheticDataGenerator:
         except Exception as e:
             self.log_error(f"HuggingFace Inference Error: {e}")
             return None
     def _parse_response(self, response: Any, provider: str) -> List[Dict[str, str]]:
         """
         Parse the LLM response and return a list of synthetic training examples.
-        Expects the response to be JSON formatted; if JSON decoding fails,
-        uses ast.literal_eval as a fallback.
         """
         st.write("Parsing response for provider:", provider)
         try:
             if provider == "HuggingFace":
                 if isinstance(response, list) and response and "generated_text" in response[0]:
                     raw_text = response[0]["generated_text"]
                 else:
                     self.log_error("Unexpected HuggingFace response format.")
                     return []
             else:
                 if response and hasattr(response, "choices") and response.choices:
                     raw_text = response.choices[0].message.content
                 else:
                     self.log_error("Unexpected response format from provider.")
                     return []
             try:
                 examples = json.loads(raw_text)
             except json.JSONDecodeError as e:
-                self.log_error(f"JSON Parsing Error: {e}. Attempting fallback with ast.literal_eval. Raw output: {raw_text}")
                 try:
                     examples = ast.literal_eval(raw_text)
                 except Exception as e2:
                     self.log_error(f"ast.literal_eval failed: {e2}")
                     return []
             if isinstance(examples, list):
                 return examples
             else:
@@ -300,41 +317,45 @@ class SyntheticDataGenerator:
             return []
-# ============ UI Components ============
 def config_ui(generator: SyntheticDataGenerator) -> None:
-    """Display configuration options in the sidebar and update URL query parameters."""
     with st.sidebar:
         st.header("Configuration")
-        # Retrieve query parameters (if any)
         params = st.experimental_get_query_params()
         default_provider = params.get("provider", ["OpenAI"])[0]
         default_model = params.get("model", ["gpt-4-turbo"])[0]
         default_temperature = float(params.get("temperature", [DEFAULT_TEMPERATURE])[0])
         default_num_examples = int(params.get("num_examples", [3])[0])
         provider_options = list(generator.providers.keys())
-        provider = st.selectbox("Select Provider", provider_options,
-                                index=provider_options.index(default_provider) if default_provider in provider_options else 0)
         st.session_state.config["provider"] = provider
         provider_cfg = generator.providers[provider]
         model_options = provider_cfg["models"]
         model = st.selectbox("Select Model", model_options,
-                             index=model_options.index(default_model) if default_model in model_options else 0)
         st.session_state.config["model"] = model
         temperature = st.slider("Temperature", 0.0, 1.0, default_temperature)
         st.session_state.config["temperature"] = temperature
-        num_examples = st.number_input("Number of Training Examples", min_value=1, max_value=10,
                                        value=default_num_examples, step=1)
         st.session_state.config["num_examples"] = num_examples
         api_key = st.text_input(f"{provider} API Key", type="password")
         st.session_state.api_key = api_key
-        # Update URL query parameters using the new API (st.set_query_params)
         st.set_query_params(
             provider=st.session_state.config["provider"],
             model=st.session_state.config["model"],
@@ -343,10 +364,10 @@ def config_ui(generator: SyntheticDataGenerator) -> None:
         )
 def input_ui(generator: SyntheticDataGenerator) -> None:
-    """Display input data source options using tabs."""
     st.subheader("Input Data Sources")
     tabs = st.tabs(["Text", "PDF", "CSV", "API", "Database"])
     with tabs[0]:
         text_input = st.text_area("Enter text input", height=150)
         if st.button("Add Text Input", key="text_input"):
@@ -355,19 +376,19 @@ def input_ui(generator: SyntheticDataGenerator) -> None:
                 st.success("Text input added!")
             else:
                 st.warning("Empty text input.")
     with tabs[1]:
         pdf_file = st.file_uploader("Upload PDF", type=["pdf"])
         if pdf_file is not None:
             st.session_state.inputs.append(generator.handle_pdf(pdf_file))
             st.success("PDF input added!")
     with tabs[2]:
         csv_file = st.file_uploader("Upload CSV", type=["csv"])
         if csv_file is not None:
             st.session_state.inputs.append(generator.handle_csv(csv_file))
             st.success("CSV input added!")
     with tabs[3]:
         api_url = st.text_input("API Endpoint URL")
         api_headers = st.text_area("API Headers (JSON format, optional)", height=100)
@@ -380,7 +401,7 @@ def input_ui(generator: SyntheticDataGenerator) -> None:
                 generator.log_error(f"Invalid JSON for API Headers: {e}")
             st.session_state.inputs.append(generator.handle_api({"url": api_url, "headers": headers}))
             st.success("API input added!")
     with tabs[4]:
         db_conn = st.text_input("Database Connection String")
         db_query = st.text_area("Database Query", height=100)
@@ -389,12 +410,12 @@ def input_ui(generator: SyntheticDataGenerator) -> None:
             st.success("Database input added!")
 def output_ui(generator: SyntheticDataGenerator) -> None:
-    """Display the generated synthetic data and provide download options (JSON and CSV)."""
     st.subheader("Synthetic Data Output")
     if st.session_state.synthetic_data:
         st.write("### Generated Training Examples")
         st.write(st.session_state.synthetic_data)
         # Download as JSON
         st.download_button(
             "Download as JSON",
@@ -402,7 +423,7 @@ def output_ui(generator: SyntheticDataGenerator) -> None:
             file_name="synthetic_data.json",
             mime="application/json"
         )
         # Download as CSV
         try:
             df = pd.DataFrame(st.session_state.synthetic_data)
@@ -419,7 +440,7 @@ def output_ui(generator: SyntheticDataGenerator) -> None:
         st.info("No synthetic data generated yet.")
 def logs_ui() -> None:
-    """Display error logs and debugging information in an expandable section."""
     with st.expander("Error Logs & Debug Info", expanded=False):
         if st.session_state.error_logs:
             for log in st.session_state.error_logs:
@@ -434,21 +455,20 @@ def main() -> None:
     st.markdown(
         """
         Welcome to the Advanced Synthetic Data Generator. This tool creates synthetic training examples
-        for fine-tuning models. Configure your provider in the sidebar, add input data, and click the button
-        below to generate synthetic data.
         """
     )
-    # Initialize generator and display configuration UI
     generator = SyntheticDataGenerator()
     config_ui(generator)
     st.header("1. Input Data")
     input_ui(generator)
     if st.button("Clear All Inputs"):
         st.session_state.inputs = []
         st.success("All inputs have been cleared!")
     st.header("2. Generate Synthetic Data")
     if st.button("Generate Synthetic Data", key="generate_data"):
         with st.spinner("Generating synthetic data..."):
@@ -456,10 +476,10 @@ def main() -> None:
                 st.success("Synthetic data generated successfully!")
             else:
                 st.error("Data generation failed. Check logs for details.")
     st.header("3. Output")
     output_ui(generator)
     st.header("4. Logs & Debug Information")
     logs_ui()

 import json
 import ast
+import logging
 import requests
 import streamlit as st
 import pdfplumber
 import sqlalchemy
 from typing import Any, Dict, List, Callable
+# Configure Python logging for production diagnostics.
+logger = logging.getLogger("SyntheticDataGenerator")
+logger.setLevel(logging.INFO)
+if not logger.handlers:
+    handler = logging.StreamHandler()
+    handler.setLevel(logging.INFO)
+    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
+    handler.setFormatter(formatter)
+    logger.addHandler(handler)
 # Provider clients – ensure these libraries are installed
 try:
     from openai import OpenAI
 except ImportError:
     groq = None
+# Constants for external APIs
 HF_API_URL: str = "https://api-inference.huggingface.co/models/"
 DEFAULT_TEMPERATURE: float = 0.1
 GROQ_MODEL: str = "mixtral-8x7b-32768"
 class SyntheticDataGenerator:
     """
+    An advanced synthetic data generator for creating fine-tuning training examples.
+    This generator uses various input sources and an LLM provider to create synthetic data.
+    Each generated example is a dictionary with 'input' and 'output' keys.
     """
     def __init__(self) -> None:
         self._setup_providers()
         self._setup_input_handlers()
         self._initialize_session_state()
+        # Prompt template: note the use of escaped curly braces so that literal braces are kept.
         self.custom_prompt_template: str = (
             "You are an expert in generating synthetic training data for fine-tuning. "
             "Generate {num_examples} training examples from the following data, formatted as a JSON list of dictionaries. "
             "Each dictionary must have keys 'input' and 'output'. "
+            "The examples should be clear, diverse, and based solely on the provided data. Do not add any external information.\n\n"
             "Example JSON Output:\n"
             "[{{'input': 'sample input text 1', 'output': 'sample output text 1'}}, "
             "{{'input': 'sample input text 2', 'output': 'sample output text 2'}}]\n\n"
             "Now, generate {num_examples} training examples from this data:\n{data}"
         )
     def _setup_providers(self) -> None:
         """Configure available LLM providers and their client initialization routines."""
         self.providers: Dict[str, Dict[str, Any]] = {
                 "models": ["gpt2", "llama-2"],
             },
         }
     def _setup_input_handlers(self) -> None:
+        """Register input handlers for various data types."""
         self.input_handlers: Dict[str, Callable[[Any], Dict[str, Any]]] = {
             "text": self.handle_text,
             "pdf": self.handle_pdf,
             "api": self.handle_api,
             "db": self.handle_db,
         }
     def _initialize_session_state(self) -> None:
+        """
+        Initialize the Streamlit session state with default configuration.
+        Also pre-populate configuration from URL query parameters.
+        """
         defaults: Dict[str, Any] = {
             "config": {
                 "provider": "OpenAI",
                 "model": "gpt-4-turbo",
                 "temperature": DEFAULT_TEMPERATURE,
+                "num_examples": 3,
             },
             "api_key": "",
+            "inputs": [],             # List to store input sources
+            "synthetic_data": None,   # Generated synthetic training examples
+            "error_logs": [],         # To store error messages
         }
         for key, value in defaults.items():
             if key not in st.session_state:
                 st.session_state.config["num_examples"] = int(params["num_examples"][0])
             except ValueError:
                 pass
     def log_error(self, message: str) -> None:
+        """Log an error message to both Streamlit and the production logger."""
         st.session_state.error_logs.append(message)
         st.error(message)
+        logger.error(message)
     # ----- Input Handlers -----
     def handle_text(self, text: str) -> Dict[str, Any]:
+        """Return plain text input."""
         return {"data": text, "source": "text"}
+    def handle_pdf(self, file: Any) -> Dict[str, Any]:
         """Extract text from a PDF file."""
         try:
             with pdfplumber.open(file) as pdf:
         except Exception as e:
             self.log_error(f"PDF Processing Error: {e}")
             return {"data": "", "source": "pdf"}
+    def handle_csv(self, file: Any) -> Dict[str, Any]:
+        """Process CSV file by converting it to JSON."""
         try:
             df = pd.read_csv(file)
+            return {"data": df.to_json(orient="records"), "source": "csv"}
         except Exception as e:
             self.log_error(f"CSV Processing Error: {e}")
             return {"data": "", "source": "csv"}
     def handle_api(self, config: Dict[str, str]) -> Dict[str, Any]:
         """Fetch data from an API endpoint."""
         try:
         except Exception as e:
             self.log_error(f"API Processing Error: {e}")
             return {"data": "", "source": "api"}
     def handle_db(self, config: Dict[str, str]) -> Dict[str, Any]:
+        """Query a database using a connection string and SQL query."""
         try:
             engine = sqlalchemy.create_engine(config["connection"])
             with engine.connect() as conn:
         except Exception as e:
             self.log_error(f"Database Processing Error: {e}")
             return {"data": "", "source": "db"}
     def aggregate_inputs(self) -> str:
+        """Aggregate all input data sources into a single string."""
+        aggregated = ""
         for item in st.session_state.inputs:
+            aggregated += f"Source: {item.get('source', 'unknown')}\n{item.get('data', '')}\n\n"
+        return aggregated.strip()
     def build_prompt(self) -> str:
         """
+        Build the complete prompt using the custom template, aggregated inputs,
+        and the configured number of examples.
         """
         data = self.aggregate_inputs()
         num_examples = st.session_state.config.get("num_examples", 3)
         st.write("### Built Prompt")
         st.write(prompt)
         return prompt
     def generate_synthetic_data(self) -> bool:
         """
+        Generate synthetic training examples by sending the prompt to the selected LLM provider.
         """
         api_key: str = st.session_state.api_key
         if not api_key:
             self.log_error("API key is missing!")
             return False
         provider_name: str = st.session_state.config["provider"]
         provider_cfg: Dict[str, Any] = self.providers.get(provider_name, {})
         if not provider_cfg:
             self.log_error(f"Provider {provider_name} is not configured.")
             return False
         client_initializer: Callable[[str], Any] = provider_cfg["client"]
         client = client_initializer(api_key)
         model: str = st.session_state.config["model"]
         temperature: float = st.session_state.config["temperature"]
         prompt: str = self.build_prompt()
         st.info(f"Using **{provider_name}** with model **{model}** at temperature **{temperature:.2f}**")
         try:
             if provider_name == "HuggingFace":
                 response = self._huggingface_inference(client, prompt, model)
             else:
                 response = self._standard_inference(client, prompt, model, temperature)
             st.write("### Raw API Response")
             st.write(response)
             synthetic_examples = self._parse_response(response, provider_name)
             st.write("### Parsed Synthetic Data")
             st.write(synthetic_examples)
             st.session_state.synthetic_data = synthetic_examples
             return True
         except Exception as e:
             self.log_error(f"Generation failed: {e}")
             return False
     def _standard_inference(self, client: Any, prompt: str, model: str, temperature: float) -> Any:
+        """
+        Inference method for providers with an OpenAI-compatible API.
+        """
         try:
             st.write("Sending prompt via standard inference...")
             result = client.chat.completions.create(
         except Exception as e:
             self.log_error(f"Standard Inference Error: {e}")
             return None
     def _huggingface_inference(self, client: Dict[str, Any], prompt: str, model: str) -> Any:
+        """
+        Inference method for the Hugging Face Inference API.
+        """
         try:
             st.write("Sending prompt to HuggingFace API...")
             response = requests.post(
         except Exception as e:
             self.log_error(f"HuggingFace Inference Error: {e}")
             return None
     def _parse_response(self, response: Any, provider: str) -> List[Dict[str, str]]:
         """
         Parse the LLM response and return a list of synthetic training examples.
+        Attempts JSON decoding first and falls back to ast.literal_eval.
         """
         st.write("Parsing response for provider:", provider)
         try:
             if provider == "HuggingFace":
+                # Expect response to be a list with a key "generated_text"
                 if isinstance(response, list) and response and "generated_text" in response[0]:
                     raw_text = response[0]["generated_text"]
                 else:
                     self.log_error("Unexpected HuggingFace response format.")
                     return []
             else:
+                # For OpenAI/Groq, look for choices[0].message.content
                 if response and hasattr(response, "choices") and response.choices:
                     raw_text = response.choices[0].message.content
                 else:
                     self.log_error("Unexpected response format from provider.")
                     return []
             try:
                 examples = json.loads(raw_text)
             except json.JSONDecodeError as e:
+                self.log_error(f"JSON Parsing Error: {e}. Fallback with ast.literal_eval. Raw output: {raw_text}")
                 try:
                     examples = ast.literal_eval(raw_text)
                 except Exception as e2:
                     self.log_error(f"ast.literal_eval failed: {e2}")
                     return []
             if isinstance(examples, list):
                 return examples
             else:
             return []
+# =================== UI Components ===================
 def config_ui(generator: SyntheticDataGenerator) -> None:
+    """
+    Display configuration options in the sidebar.
+    Updates URL query parameters using st.set_query_params.
+    """
     with st.sidebar:
         st.header("Configuration")
         params = st.experimental_get_query_params()
         default_provider = params.get("provider", ["OpenAI"])[0]
         default_model = params.get("model", ["gpt-4-turbo"])[0]
         default_temperature = float(params.get("temperature", [DEFAULT_TEMPERATURE])[0])
         default_num_examples = int(params.get("num_examples", [3])[0])
         provider_options = list(generator.providers.keys())
+        provider = st.selectbox("Select Provider", provider_options,
+                                index=provider_options.index(default_provider)
+                                if default_provider in provider_options else 0)
         st.session_state.config["provider"] = provider
         provider_cfg = generator.providers[provider]
         model_options = provider_cfg["models"]
         model = st.selectbox("Select Model", model_options,
+                             index=model_options.index(default_model)
+                             if default_model in model_options else 0)
         st.session_state.config["model"] = model
         temperature = st.slider("Temperature", 0.0, 1.0, default_temperature)
         st.session_state.config["temperature"] = temperature
+        num_examples = st.number_input("Number of Training Examples", min_value=1, max_value=10,
                                        value=default_num_examples, step=1)
         st.session_state.config["num_examples"] = num_examples
         api_key = st.text_input(f"{provider} API Key", type="password")
         st.session_state.api_key = api_key
+        # Update URL query parameters (shareable configuration)
         st.set_query_params(
             provider=st.session_state.config["provider"],
             model=st.session_state.config["model"],
         )
 def input_ui(generator: SyntheticDataGenerator) -> None:
+    """Display input data source options in tabs."""
     st.subheader("Input Data Sources")
     tabs = st.tabs(["Text", "PDF", "CSV", "API", "Database"])
     with tabs[0]:
         text_input = st.text_area("Enter text input", height=150)
         if st.button("Add Text Input", key="text_input"):
                 st.success("Text input added!")
             else:
                 st.warning("Empty text input.")
     with tabs[1]:
         pdf_file = st.file_uploader("Upload PDF", type=["pdf"])
         if pdf_file is not None:
             st.session_state.inputs.append(generator.handle_pdf(pdf_file))
             st.success("PDF input added!")
     with tabs[2]:
         csv_file = st.file_uploader("Upload CSV", type=["csv"])
         if csv_file is not None:
             st.session_state.inputs.append(generator.handle_csv(csv_file))
             st.success("CSV input added!")
     with tabs[3]:
         api_url = st.text_input("API Endpoint URL")
         api_headers = st.text_area("API Headers (JSON format, optional)", height=100)
                 generator.log_error(f"Invalid JSON for API Headers: {e}")
             st.session_state.inputs.append(generator.handle_api({"url": api_url, "headers": headers}))
             st.success("API input added!")
     with tabs[4]:
         db_conn = st.text_input("Database Connection String")
         db_query = st.text_area("Database Query", height=100)
             st.success("Database input added!")
 def output_ui(generator: SyntheticDataGenerator) -> None:
+    """Display the generated synthetic data and download options (JSON and CSV)."""
     st.subheader("Synthetic Data Output")
     if st.session_state.synthetic_data:
         st.write("### Generated Training Examples")
         st.write(st.session_state.synthetic_data)
         # Download as JSON
         st.download_button(
             "Download as JSON",
             file_name="synthetic_data.json",
             mime="application/json"
         )
         # Download as CSV
         try:
             df = pd.DataFrame(st.session_state.synthetic_data)
         st.info("No synthetic data generated yet.")
 def logs_ui() -> None:
+    """Display error logs and debug information in an expandable section."""
     with st.expander("Error Logs & Debug Info", expanded=False):
         if st.session_state.error_logs:
             for log in st.session_state.error_logs:
     st.markdown(
         """
         Welcome to the Advanced Synthetic Data Generator. This tool creates synthetic training examples
+        for fine-tuning models. Configure your provider in the sidebar, add input data, and generate synthetic data.
         """
     )
+    # Initialize generator and UI
     generator = SyntheticDataGenerator()
     config_ui(generator)
     st.header("1. Input Data")
     input_ui(generator)
     if st.button("Clear All Inputs"):
         st.session_state.inputs = []
         st.success("All inputs have been cleared!")
     st.header("2. Generate Synthetic Data")
     if st.button("Generate Synthetic Data", key="generate_data"):
         with st.spinner("Generating synthetic data..."):
                 st.success("Synthetic data generated successfully!")
             else:
                 st.error("Data generation failed. Check logs for details.")
     st.header("3. Output")
     output_ui(generator)
     st.header("4. Logs & Debug Information")
     logs_ui()