Spaces:

mgbam
/

sythenticdata

Sleeping

App Files Community

mgbam committed on Feb 9

Commit

6bba837

verified ·

1 Parent(s): e9a68df

Update app.py

Browse files

Files changed (1) hide show

app.py +174 -148

app.py CHANGED Viewed

@@ -1,33 +1,36 @@
 import streamlit as st
 import pdfplumber
 import pandas as pd
-import requests
-import json
 from PIL import Image
 from openai import OpenAI
 import google.generative_ai as genai
 import groq
-import sqlalchemy
-from typing import Dict, Any
 # --- CONSTANTS ---
 HF_API_URL = "https://api-inference.huggingface.co/models/"
 DEFAULT_TEMPERATURE = 0.1
-MODEL = "mixtral-8x7b-32768"  # Groq model
-API_HEADERS_HEIGHT = 70  # Minimum height for st.text_area
 class SyntheticDataGenerator:
-    """Generates synthetic Q&A data from various input sources using LLMs."""
-    def __init__(self):
         self._setup_providers()
         self._setup_input_handlers()
         self._initialize_session_state()
-    def _setup_providers(self):
-        """Defines the available LLM providers and their configurations."""
-        self.providers = {
             "Deepseek": {
                 "client": lambda key: OpenAI(base_url="https://api.deepseek.com/v1", api_key=key),
                 "models": ["deepseek-chat"],
@@ -38,7 +41,7 @@ class SyntheticDataGenerator:
             },
             "Groq": {
                 "client": lambda key: groq.Groq(api_key=key),
-                "models": [MODEL],
             },
             "HuggingFace": {
                 "client": lambda key: {"headers": {"Authorization": f"Bearer {key}"}},
@@ -50,9 +53,9 @@ class SyntheticDataGenerator:
             },
         }
-    def _setup_input_handlers(self):
-        """Defines handlers for different input data types."""
-        self.input_handlers = {
             "pdf": self.handle_pdf,
             "text": self.handle_text,
             "csv": self.handle_csv,
@@ -60,21 +63,25 @@ class SyntheticDataGenerator:
             "db": self.handle_db,
         }
-    def _initialize_session_state(self):
-        """Initializes Streamlit session state variables."""
         session_defaults = {
             "inputs": [],
             "qa_data": [],
             "processing": {"stage": "idle", "progress": 0, "errors": []},
-            "config": {"provider": "Groq", "model": MODEL, "temperature": DEFAULT_TEMPERATURE},
-            "api_key": "",  # Explicitly initialize api_key in session state
         }
         for key, value in session_defaults.items():
             if key not in st.session_state:
                 st.session_state[key] = value
-    def _configure_google_genai(self, api_key: str):
-        """Configures the Google Generative AI client."""
         try:
             genai.configure(api_key=api_key)
             return genai.GenerativeModel
@@ -83,50 +90,63 @@ class SyntheticDataGenerator:
             return None
     # --- INPUT HANDLERS ---
-    def handle_pdf(self, file):
-        """Extracts text and images from a PDF file."""
         try:
             with pdfplumber.open(file) as pdf:
                 extracted_data = []
                 for i, page in enumerate(pdf.pages):
                     page_text = page.extract_text() or ""
                     page_images = self.process_images(page)
-                    extracted_data.append(
-                        {"text": page_text, "images": page_images, "meta": {"type": "pdf", "page": i + 1}}
-                    )
                 return extracted_data
         except Exception as e:
-            self._log_error(f"PDF Error: {str(e)}")
             return []
-    def handle_text(self, text):
-        """Handles manual text input."""
         return [{"text": text, "meta": {"type": "domain", "source": "manual"}}]
-    def handle_csv(self, file):
-        """Reads a CSV file and prepares data for Q&A generation."""
         try:
             df = pd.read_csv(file)
             return [
-                {"text": "\n".join([f"{col}: {row[col]}" for col in df.columns]), "meta": {"type": "csv", "columns": list(df.columns)}}
                 for _, row in df.iterrows()
             ]
         except Exception as e:
-            self._log_error(f"CSV Error: {str(e)}")
             return []
-    def handle_api(self, config):
-        """Fetches data from an API endpoint."""
         try:
-            response = requests.get(config["url"], headers=config["headers"], timeout=10)  # Add timeout
-            response.raise_for_status()  # Raise HTTPError for bad responses
-            return [{"text": json.dumps(response.json()), "meta": {"type": "api", "endpoint": config["url"]}}]
         except requests.exceptions.RequestException as e:
-            self._log_error(f"API Error: {str(e)}")
             return []
-    def handle_db(self, config):
-        """Connects to a database and executes a query."""
         try:
             engine = sqlalchemy.create_engine(config["connection"])
             with engine.connect() as conn:
@@ -139,11 +159,11 @@ class SyntheticDataGenerator:
                     for row in result
                 ]
         except Exception as e:
-            self._log_error(f"DB Error: {str(e)}")
             return []
-    def process_images(self, page):
-        """Extracts and processes images from a PDF page."""
         images = []
         for img in page.images:
             try:
@@ -151,69 +171,70 @@ class SyntheticDataGenerator:
                 width = int(stream.get("Width", 0))
                 height = int(stream.get("Height", 0))
                 image_data = stream.get_data()
                 if width > 0 and height > 0 and image_data:
                     try:
                         image = Image.frombytes("RGB", (width, height), image_data)
                         images.append({"data": image, "meta": {"dims": (width, height)}})
                     except Exception as e:
-                        self._log_error(f"Image Creation Error: {str(e)}. Width: {width}, Height: {height}")
                 else:
-                    self._log_error(
-                        f"Image Error: Insufficient data or invalid dimensions (w={width}, h={height})"
-                    )
             except Exception as e:
-                self._log_error(f"Image Extraction Error: {str(e)}")
         return images
     # --- LLM INFERENCE ---
     def generate(self, api_key: str) -> bool:
-        """Generates Q&A pairs using the selected LLM provider."""
-        try:
-            if not api_key:
-                st.error("API Key cannot be empty.")
-                return False
-            provider_cfg = self.providers[st.session_state.config["provider"]]
             client_initializer = provider_cfg["client"]
-            if st.session_state.config["provider"] == "Google":
                 client = client_initializer(api_key)
                 if not client:
-                    return False  # Google config failed
             else:
                 client = client_initializer(api_key)
             for i, input_data in enumerate(st.session_state.inputs):
                 st.session_state.processing["progress"] = (i + 1) / len(st.session_state.inputs)
-                # Debugging: Display input data
                 st.write("--- Input Data ---")
                 st.write(input_data["text"])
-                if st.session_state.config["provider"] == "HuggingFace":
                     response = self._huggingface_inference(client, input_data)
-                elif st.session_state.config["provider"] == "Google":
                     response = self._google_inference(client, input_data)
                 else:
                     response = self._standard_inference(client, input_data)
                 if response:
-                    # Debugging: Display raw response
                     st.write("--- Raw Response ---")
                     st.write(response)
-                    st.session_state.qa_data.extend(self._parse_response(response, st.session_state.config["provider"]))
             return True
         except Exception as e:
-            self._log_error(f"Generation Error: {str(e)}")
             return False
-    def _standard_inference(self, client, input_data):
-        """Performs inference using OpenAI-compatible API."""
         try:
             return client.chat.completions.create(
                 model=st.session_state.config["model"],
@@ -224,8 +245,8 @@ class SyntheticDataGenerator:
             self._log_error(f"OpenAI Inference Error: {e}")
             return None
-    def _huggingface_inference(self, client, input_data):
-        """Performs inference using Hugging Face Inference API."""
         try:
             response = requests.post(
                 HF_API_URL + st.session_state.config["model"],
@@ -238,13 +259,15 @@ class SyntheticDataGenerator:
             self._log_error(f"Hugging Face Inference Error: {e}")
             return None
-    def _google_inference(self, client, input_data):
-        """Performs inference using Google Generative AI API."""
         try:
             model = client(st.session_state.config["model"])
             response = model.generate_content(
                 self._build_prompt(input_data),
-                generation_config=genai.types.GenerationConfig(temperature=st.session_state.config["temperature"]),
             )
             return response
         except Exception as e:
@@ -252,172 +275,175 @@ class SyntheticDataGenerator:
             return None
     # --- PROMPT ENGINEERING ---
-    def _build_prompt(self, input_data):
-        """Builds the prompt for the LLM based on the input data type."""
-        base = (
             "You are an expert in extracting question and answer pairs from documents. "
             "Generate 3 Q&A pairs from the following data, formatted as a JSON list of dictionaries.\n"
             "Each dictionary must have the keys 'question' and 'answer'.\n"
-            "The 'question' should be clear and concise, and the 'answer' should directly answer the question using only "
-            "information from the data. Do not hallucinate or invent information.\n"
-            "Answer from the exact same document, not outside from the document\n"
             "Example JSON Output:\n"
             '[{"question": "What is the capital of France?", "answer": "The capital of France is Paris."}, '
             '{"question": "What is the highest mountain in the world?", "answer": "The highest mountain in the world is Mount Everest."}, '
             '{"question": "What is the chemical symbol for gold?", "answer": "The chemical symbol for gold is Au."}]\n'
             "Now, generate 3 Q&A pairs from this data:\n"
         )
-        if input_data["meta"]["type"] == "csv":
-            return base + "Data:\n" + input_data["text"]
-        elif input_data["meta"]["type"] == "api":
-            return base + "API response:\n" + input_data["text"]
-        return base + input_data["text"]
     # --- RESPONSE PARSING ---
-    def _parse_response(self, response: Any, provider: str) -> list[dict[str, str]]:
-        """Parses the LLM response into a list of Q&A pairs."""
         try:
             response_text = ""
             if provider == "HuggingFace":
-                response_text = response[0]["generated_text"]
-                return response_text
             elif provider == "Google":
                 response_text = response.text.strip()
             else:  # OpenAI, Deepseek, Groq
                 if not response or not response.choices or not response.choices[0].message.content:
                     self._log_error("Empty or malformed response from LLM.")
                     return []
                 response_text = response.choices[0].message.content
             try:
                 json_output = json.loads(response_text)
-                if isinstance(json_output, list):
-                    qa_pairs = json_output
-                elif isinstance(json_output, dict) and "questionList" in json_output:
-                    qa_pairs = json_output["questionList"]
-                else:
-                    self._log_error(f"Unexpected JSON structure: {response_text}")
-                    return []
-                if not isinstance(qa_pairs, list):
-                    self._log_error(f"Expected a list of QA pairs, but got: {type(qa_pairs)}")
-                    return []
-                for pair in qa_pairs:
-                    if not isinstance(pair, dict) or "question" not in pair or "answer" not in pair:
-                        self._log_error(f"Invalid QA pair structure: {pair}")
-                        return []
-                return qa_pairs
-            except json.JSONDecodeError as e:
-                self._log_error(f"JSON Parse Error: {e}. Raw Response: {response_text}")
-                return []
         except Exception as e:
             self._log_error(f"Parse Error: {e}. Raw Response: {response}")
             return []
-    def _log_error(self, message):
-        """Logs an error message to Streamlit session state and displays it."""
         st.session_state.processing["errors"].append(message)
         st.error(message)
 # --- STREAMLIT UI COMPONENTS ---
-def input_sidebar(gen: SyntheticDataGenerator) -> str:
-    """Creates the input sidebar in the Streamlit UI."""
     with st.sidebar:
         st.header("⚙️ Configuration")
-        provider = st.selectbox("Provider", list(gen.providers.keys()))
-        st.session_state.config["provider"] = provider  # Update session state immediately
-        provider_cfg = gen.providers[provider]
         api_key = st.text_input(f"{provider} API Key", type="password")
         st.session_state["api_key"] = api_key
         model = st.selectbox("Model", provider_cfg["models"])
-        st.session_state.config["model"] = model  # Update model selection
-        temp = st.slider("Temperature", 0.0, 1.0, DEFAULT_TEMPERATURE)
-        st.session_state.config["temperature"] = temp  # Update temperature
-        # Input Source Selection
         st.header("🔗 Data Sources")
-        input_type = st.selectbox("Input Type", list(gen.input_handlers.keys()))
         if input_type == "text":
             domain_input = st.text_area("Domain Knowledge", height=150)
             if st.button("Add Domain Input"):
-                st.session_state.inputs.append(gen.input_handlers["text"](domain_input)[0])
         elif input_type == "csv":
             csv_file = st.file_uploader("Upload CSV", type=["csv"])
             if csv_file:
-                st.session_state.inputs.extend(gen.input_handlers["csv"](csv_file))
         elif input_type == "api":
             api_url = st.text_input("API Endpoint")
             api_headers = st.text_area("API Headers (JSON format, optional)", height=API_HEADERS_HEIGHT)
             headers = {}
-            try:
-                if api_headers:
                     headers = json.loads(api_headers)
-            except json.JSONDecodeError:
-                st.error("Invalid JSON format for API headers.")
             if st.button("Add API Input"):
-                st.session_state.inputs.extend(gen.input_handlers["api"]({"url": api_url, "headers": headers}))
         elif input_type == "db":
             db_connection = st.text_input("Database Connection String")
             db_query = st.text_area("Database Query")
             db_table = st.text_input("Table Name (optional)")
             if st.button("Add DB Input"):
-                st.session_state.inputs.extend(
-                    gen.input_handlers["db"]({"connection": db_connection, "query": db_query, "table": db_table})
-                )
-        return api_key
-def main_display(gen: SyntheticDataGenerator):
-    """Creates the main display area in the Streamlit UI."""
     st.title("🚀 Enterprise Synthetic Data Factory")
     col1, col2 = st.columns([3, 1])
     with col1:
         pdf_file = st.file_uploader("Upload Document", type=["pdf"])
         if pdf_file:
-            st.session_state.inputs.extend(gen.input_handlers["pdf"](pdf_file))
     with col2:
         if st.button("Start Generation"):
-            with st.status("Processing..."):
                 if not st.session_state["api_key"]:
                     st.error("Please provide an API Key.")
                 else:
-                    gen.generate(st.session_state["api_key"])
     if st.session_state.qa_data:
         st.header("Generated Data")
         df = pd.DataFrame(st.session_state.qa_data)
         st.dataframe(df)
         st.download_button("Export CSV", df.to_csv(index=False), "synthetic_data.csv")
-def main():
     """Main function to run the Streamlit application."""
-    gen = SyntheticDataGenerator()
-    api_key = input_sidebar(gen)
-    main_display(gen)
 if __name__ == "__main__":
-    main()

+import json
+import requests
 import streamlit as st
 import pdfplumber
 import pandas as pd
+import sqlalchemy
 from PIL import Image
+from typing import Any, Dict, List
+# Provider clients
 from openai import OpenAI
 import google.generative_ai as genai
 import groq
 # --- CONSTANTS ---
 HF_API_URL = "https://api-inference.huggingface.co/models/"
 DEFAULT_TEMPERATURE = 0.1
+GROQ_MODEL = "mixtral-8x7b-32768"  # Groq model
+API_HEADERS_HEIGHT = 70  # Height for the API headers text area
 class SyntheticDataGenerator:
+    """
+    Generates synthetic Q&A data from various input sources using multiple LLM providers.
+    """
+    def __init__(self) -> None:
         self._setup_providers()
         self._setup_input_handlers()
         self._initialize_session_state()
+    def _setup_providers(self) -> None:
+        """Configure available LLM providers and their client initializations."""
+        self.providers: Dict[str, Dict[str, Any]] = {
             "Deepseek": {
                 "client": lambda key: OpenAI(base_url="https://api.deepseek.com/v1", api_key=key),
                 "models": ["deepseek-chat"],
             },
             "Groq": {
                 "client": lambda key: groq.Groq(api_key=key),
+                "models": [GROQ_MODEL],
             },
             "HuggingFace": {
                 "client": lambda key: {"headers": {"Authorization": f"Bearer {key}"}},
             },
         }
+    def _setup_input_handlers(self) -> None:
+        """Define handlers for different input data types."""
+        self.input_handlers: Dict[str, Any] = {
             "pdf": self.handle_pdf,
             "text": self.handle_text,
             "csv": self.handle_csv,
             "db": self.handle_db,
         }
+    def _initialize_session_state(self) -> None:
+        """Initialize Streamlit session state with default configurations."""
         session_defaults = {
             "inputs": [],
             "qa_data": [],
             "processing": {"stage": "idle", "progress": 0, "errors": []},
+            "config": {
+                "provider": "Groq",
+                "model": GROQ_MODEL,
+                "temperature": DEFAULT_TEMPERATURE,
+            },
+            "api_key": "",  # Explicitly initialize the API key
         }
         for key, value in session_defaults.items():
             if key not in st.session_state:
                 st.session_state[key] = value
+    def _configure_google_genai(self, api_key: str) -> Any:
+        """Configure and return the Google Generative AI client."""
         try:
             genai.configure(api_key=api_key)
             return genai.GenerativeModel
             return None
     # --- INPUT HANDLERS ---
+    def handle_pdf(self, file) -> List[Dict[str, Any]]:
+        """
+        Extract text and images from a PDF file.
+        Returns:
+            A list of dictionaries containing text, images, and metadata.
+        """
         try:
             with pdfplumber.open(file) as pdf:
                 extracted_data = []
                 for i, page in enumerate(pdf.pages):
                     page_text = page.extract_text() or ""
                     page_images = self.process_images(page)
+                    extracted_data.append({
+                        "text": page_text,
+                        "images": page_images,
+                        "meta": {"type": "pdf", "page": i + 1},
+                    })
                 return extracted_data
         except Exception as e:
+            self._log_error(f"PDF Error: {e}")
             return []
+    def handle_text(self, text: str) -> List[Dict[str, Any]]:
+        """Handle manual text input."""
         return [{"text": text, "meta": {"type": "domain", "source": "manual"}}]
+    def handle_csv(self, file) -> List[Dict[str, Any]]:
+        """Process a CSV file and format the data for Q&A generation."""
         try:
             df = pd.read_csv(file)
             return [
+                {
+                    "text": "\n".join([f"{col}: {row[col]}" for col in df.columns]),
+                    "meta": {"type": "csv", "columns": list(df.columns)},
+                }
                 for _, row in df.iterrows()
             ]
         except Exception as e:
+            self._log_error(f"CSV Error: {e}")
             return []
+    def handle_api(self, config: Dict[str, Any]) -> List[Dict[str, Any]]:
+        """Fetch data from an API endpoint and format it for processing."""
         try:
+            response = requests.get(config["url"], headers=config["headers"], timeout=10)
+            response.raise_for_status()
+            return [{
+                "text": json.dumps(response.json()),
+                "meta": {"type": "api", "endpoint": config["url"]},
+            }]
         except requests.exceptions.RequestException as e:
+            self._log_error(f"API Error: {e}")
             return []
+    def handle_db(self, config: Dict[str, Any]) -> List[Dict[str, Any]]:
+        """Connect to a database, execute a query, and format the results."""
         try:
             engine = sqlalchemy.create_engine(config["connection"])
             with engine.connect() as conn:
                     for row in result
                 ]
         except Exception as e:
+            self._log_error(f"DB Error: {e}")
             return []
+    def process_images(self, page) -> List[Dict[str, Any]]:
+        """Extract and process images from a PDF page."""
         images = []
         for img in page.images:
             try:
                 width = int(stream.get("Width", 0))
                 height = int(stream.get("Height", 0))
                 image_data = stream.get_data()
                 if width > 0 and height > 0 and image_data:
                     try:
                         image = Image.frombytes("RGB", (width, height), image_data)
                         images.append({"data": image, "meta": {"dims": (width, height)}})
                     except Exception as e:
+                        self._log_error(f"Image Creation Error: {e} (Width: {width}, Height: {height})")
                 else:
+                    self._log_error(f"Image Error: Insufficient data or invalid dimensions (w={width}, h={height})")
             except Exception as e:
+                self._log_error(f"Image Extraction Error: {e}")
         return images
     # --- LLM INFERENCE ---
     def generate(self, api_key: str) -> bool:
+        """
+        Generate Q&A pairs using the selected LLM provider.
+        Iterates over all the input data, calls the appropriate inference method,
+        and aggregates the generated Q&A pairs into session state.
+        """
+        if not api_key:
+            st.error("API Key cannot be empty.")
+            return False
+        try:
+            provider_name = st.session_state.config["provider"]
+            provider_cfg = self.providers[provider_name]
             client_initializer = provider_cfg["client"]
+            # Initialize the client
+            if provider_name == "Google":
                 client = client_initializer(api_key)
                 if not client:
+                    return False
             else:
                 client = client_initializer(api_key)
             for i, input_data in enumerate(st.session_state.inputs):
                 st.session_state.processing["progress"] = (i + 1) / len(st.session_state.inputs)
                 st.write("--- Input Data ---")
                 st.write(input_data["text"])
+                if provider_name == "HuggingFace":
                     response = self._huggingface_inference(client, input_data)
+                elif provider_name == "Google":
                     response = self._google_inference(client, input_data)
                 else:
                     response = self._standard_inference(client, input_data)
                 if response:
                     st.write("--- Raw Response ---")
                     st.write(response)
+                    parsed_response = self._parse_response(response, provider_name)
+                    if parsed_response:
+                        st.session_state.qa_data.extend(parsed_response)
             return True
         except Exception as e:
+            self._log_error(f"Generation Error: {e}")
             return False
+    def _standard_inference(self, client: Any, input_data: Dict[str, Any]) -> Any:
+        """Perform inference using an OpenAI-compatible API."""
         try:
             return client.chat.completions.create(
                 model=st.session_state.config["model"],
             self._log_error(f"OpenAI Inference Error: {e}")
             return None
+    def _huggingface_inference(self, client: Dict[str, Any], input_data: Dict[str, Any]) -> Any:
+        """Perform inference using the Hugging Face Inference API."""
         try:
             response = requests.post(
                 HF_API_URL + st.session_state.config["model"],
             self._log_error(f"Hugging Face Inference Error: {e}")
             return None
+    def _google_inference(self, client: Any, input_data: Dict[str, Any]) -> Any:
+        """Perform inference using the Google Generative AI API."""
         try:
             model = client(st.session_state.config["model"])
             response = model.generate_content(
                 self._build_prompt(input_data),
+                generation_config=genai.types.GenerationConfig(
+                    temperature=st.session_state.config["temperature"]
+                ),
             )
             return response
         except Exception as e:
             return None
     # --- PROMPT ENGINEERING ---
+    def _build_prompt(self, input_data: Dict[str, Any]) -> str:
+        """
+        Build the prompt for the LLM based on the input data.
+        The prompt instructs the LLM to extract 3 Q&A pairs in JSON format.
+        """
+        base_prompt = (
             "You are an expert in extracting question and answer pairs from documents. "
             "Generate 3 Q&A pairs from the following data, formatted as a JSON list of dictionaries.\n"
             "Each dictionary must have the keys 'question' and 'answer'.\n"
+            "The 'question' should be clear and concise, and the 'answer' should directly answer the question "
+            "using only information from the provided data. Do not hallucinate or invent information.\n"
+            "Answer using the exact information from the document, not external knowledge.\n"
             "Example JSON Output:\n"
             '[{"question": "What is the capital of France?", "answer": "The capital of France is Paris."}, '
             '{"question": "What is the highest mountain in the world?", "answer": "The highest mountain in the world is Mount Everest."}, '
             '{"question": "What is the chemical symbol for gold?", "answer": "The chemical symbol for gold is Au."}]\n'
             "Now, generate 3 Q&A pairs from this data:\n"
         )
+        data_type = input_data["meta"].get("type", "text")
+        if data_type == "csv":
+            return base_prompt + "Data:\n" + input_data["text"]
+        elif data_type == "api":
+            return base_prompt + "API response:\n" + input_data["text"]
+        return base_prompt + input_data["text"]
     # --- RESPONSE PARSING ---
+    def _parse_response(self, response: Any, provider: str) -> List[Dict[str, str]]:
+        """
+        Parse the LLM response into a list of Q&A pairs.
+        Expects the response to be a JSON formatted string.
+        """
         try:
             response_text = ""
             if provider == "HuggingFace":
+                response_text = response[0].get("generated_text", "")
             elif provider == "Google":
                 response_text = response.text.strip()
             else:  # OpenAI, Deepseek, Groq
                 if not response or not response.choices or not response.choices[0].message.content:
                     self._log_error("Empty or malformed response from LLM.")
                     return []
                 response_text = response.choices[0].message.content
             try:
                 json_output = json.loads(response_text)
+            except json.JSONDecodeError as e:
+                self._log_error(f"JSON Parse Error: {e}. Raw Response: {response_text}")
+                return []
+            if isinstance(json_output, list):
+                qa_pairs = json_output
+            elif isinstance(json_output, dict) and "questionList" in json_output:
+                qa_pairs = json_output["questionList"]
+            else:
+                self._log_error(f"Unexpected JSON structure: {response_text}")
+                return []
+            if not isinstance(qa_pairs, list):
+                self._log_error(f"Expected a list of QA pairs, but got: {type(qa_pairs)}")
+                return []
+            for pair in qa_pairs:
+                if not isinstance(pair, dict) or "question" not in pair or "answer" not in pair:
+                    self._log_error(f"Invalid QA pair structure: {pair}")
+                    return []
+            return qa_pairs
         except Exception as e:
             self._log_error(f"Parse Error: {e}. Raw Response: {response}")
             return []
+    def _log_error(self, message: str) -> None:
+        """Log an error message to the session state and display it."""
         st.session_state.processing["errors"].append(message)
         st.error(message)
 # --- STREAMLIT UI COMPONENTS ---
+def input_sidebar(generator: SyntheticDataGenerator) -> str:
+    """Create the input sidebar in the Streamlit UI."""
     with st.sidebar:
         st.header("⚙️ Configuration")
+        provider = st.selectbox("Provider", list(generator.providers.keys()))
+        st.session_state.config["provider"] = provider  # Update provider in session state
+        provider_cfg = generator.providers[provider]
         api_key = st.text_input(f"{provider} API Key", type="password")
         st.session_state["api_key"] = api_key
         model = st.selectbox("Model", provider_cfg["models"])
+        st.session_state.config["model"] = model
+        temperature = st.slider("Temperature", 0.0, 1.0, DEFAULT_TEMPERATURE)
+        st.session_state.config["temperature"] = temperature
+        # Data Source Input
         st.header("🔗 Data Sources")
+        input_type = st.selectbox("Input Type", list(generator.input_handlers.keys()))
         if input_type == "text":
             domain_input = st.text_area("Domain Knowledge", height=150)
             if st.button("Add Domain Input"):
+                st.session_state.inputs.append(generator.input_handlers["text"](domain_input)[0])
         elif input_type == "csv":
             csv_file = st.file_uploader("Upload CSV", type=["csv"])
             if csv_file:
+                st.session_state.inputs.extend(generator.input_handlers["csv"](csv_file))
         elif input_type == "api":
             api_url = st.text_input("API Endpoint")
             api_headers = st.text_area("API Headers (JSON format, optional)", height=API_HEADERS_HEIGHT)
             headers = {}
+            if api_headers:
+                try:
                     headers = json.loads(api_headers)
+                except json.JSONDecodeError:
+                    st.error("Invalid JSON format for API headers.")
             if st.button("Add API Input"):
+                st.session_state.inputs.extend(generator.input_handlers["api"]({"url": api_url, "headers": headers}))
         elif input_type == "db":
             db_connection = st.text_input("Database Connection String")
             db_query = st.text_area("Database Query")
             db_table = st.text_input("Table Name (optional)")
             if st.button("Add DB Input"):
+                st.session_state.inputs.extend(generator.input_handlers["db"]({
+                    "connection": db_connection,
+                    "query": db_query,
+                    "table": db_table
+                }))
+    return api_key
+def main_display(generator: SyntheticDataGenerator) -> None:
+    """Create the main display area in the Streamlit UI."""
     st.title("🚀 Enterprise Synthetic Data Factory")
     col1, col2 = st.columns([3, 1])
     with col1:
         pdf_file = st.file_uploader("Upload Document", type=["pdf"])
         if pdf_file:
+            st.session_state.inputs.extend(generator.input_handlers["pdf"](pdf_file))
     with col2:
         if st.button("Start Generation"):
+            with st.spinner("Processing..."):
                 if not st.session_state["api_key"]:
                     st.error("Please provide an API Key.")
                 else:
+                    generator.generate(st.session_state["api_key"])
     if st.session_state.qa_data:
         st.header("Generated Data")
         df = pd.DataFrame(st.session_state.qa_data)
         st.dataframe(df)
         st.download_button("Export CSV", df.to_csv(index=False), "synthetic_data.csv")
+def main() -> None:
     """Main function to run the Streamlit application."""
+    generator = SyntheticDataGenerator()
+    _ = input_sidebar(generator)
+    main_display(generator)
 if __name__ == "__main__":
+    main()