Spaces:

mgbam
/

sythenticdata

Sleeping

App Files Community

mgbam committed on Feb 9

Commit

ee72f5e

verified ·

1 Parent(s): 81c7e29

Update app.py

Browse files

Files changed (1) hide show

app.py +128 -148

app.py CHANGED Viewed

@@ -6,7 +6,7 @@ import pandas as pd
 import sqlalchemy
 from typing import Any, Dict, List
-# Provider clients (ensure these are installed if you plan to use them)
 try:
     from openai import OpenAI
 except ImportError:
@@ -17,35 +17,37 @@ try:
 except ImportError:
     groq = None
-# Hugging Face Inference API endpoint
 HF_API_URL = "https://api-inference.huggingface.co/models/"
 DEFAULT_TEMPERATURE = 0.1
 GROQ_MODEL = "mixtral-8x7b-32768"
-class AdvancedSyntheticDataGenerator:
     """
-    Advanced Synthetic Data Generator that supports multiple input types,
-    customizable prompt templates, multiple LLM providers, and detailed debugging.
     """
     def __init__(self) -> None:
         self._setup_providers()
         self._setup_input_handlers()
         self._initialize_session_state()
-        # Customizable prompt template with placeholders for data, instructions, and output format.
         self.custom_prompt_template = (
-            "You are an expert synthetic data generator. "
-            "Given the data below and following the instructions provided, generate high-quality, diverse synthetic data. "
-            "Ensure the output adheres to the specified format.\n\n"
-            "-------------------------\n"
-            "Data:\n{data}\n\n"
-            "Instructions:\n{instructions}\n\n"
-            "Output Format: {format}\n"
-            "-------------------------\n"
         )
     def _setup_providers(self) -> None:
-        """Configure available LLM providers and their initialization routines."""
         self.providers: Dict[str, Dict[str, Any]] = {
             "Deepseek": {
                 "client": lambda key: OpenAI(base_url="https://api.deepseek.com/v1", api_key=key) if OpenAI else None,
@@ -64,7 +66,7 @@ class AdvancedSyntheticDataGenerator:
                 "models": ["gpt2", "llama-2"],
             },
         }
     def _setup_input_handlers(self) -> None:
         """Register handlers for different input data types."""
         self.input_handlers: Dict[str, Any] = {
@@ -74,7 +76,7 @@ class AdvancedSyntheticDataGenerator:
             "api": self.handle_api,
             "db": self.handle_db,
         }
     def _initialize_session_state(self) -> None:
         """Initialize Streamlit session state with default configuration."""
         defaults = {
@@ -82,27 +84,25 @@ class AdvancedSyntheticDataGenerator:
                 "provider": "OpenAI",
                 "model": "gpt-4-turbo",
                 "temperature": DEFAULT_TEMPERATURE,
-                "output_format": "plain_text",  # Options: plain_text, json, csv
             },
             "api_key": "",
-            "inputs": [],         # List to store all input sources
-            "instructions": "",   # Custom instructions for synthetic data generation
-            "synthetic_data": "", # The generated output
-            "error_logs": [],     # Logs for any errors during processing
         }
         for key, value in defaults.items():
             if key not in st.session_state:
                 st.session_state[key] = value
     def log_error(self, message: str) -> None:
-        """Log an error message both to session state and in the UI."""
         st.session_state.error_logs.append(message)
         st.error(message)
-    # ===== Input Handlers =====
     def handle_text(self, text: str) -> Dict[str, Any]:
         return {"data": text, "source": "text"}
     def handle_pdf(self, file) -> Dict[str, Any]:
         try:
             with pdfplumber.open(file) as pdf:
@@ -114,16 +114,16 @@ class AdvancedSyntheticDataGenerator:
         except Exception as e:
             self.log_error(f"PDF Processing Error: {e}")
             return {"data": "", "source": "pdf"}
     def handle_csv(self, file) -> Dict[str, Any]:
         try:
             df = pd.read_csv(file)
-            # Convert the DataFrame to JSON for simplicity.
             return {"data": df.to_json(orient="records"), "source": "csv"}
         except Exception as e:
             self.log_error(f"CSV Processing Error: {e}")
             return {"data": "", "source": "csv"}
     def handle_api(self, config: Dict[str, str]) -> Dict[str, Any]:
         try:
             response = requests.get(config["url"], headers=config.get("headers", {}), timeout=10)
@@ -132,7 +132,7 @@ class AdvancedSyntheticDataGenerator:
         except Exception as e:
             self.log_error(f"API Processing Error: {e}")
             return {"data": "", "source": "api"}
     def handle_db(self, config: Dict[str, str]) -> Dict[str, Any]:
         try:
             engine = sqlalchemy.create_engine(config["connection"])
@@ -143,7 +143,7 @@ class AdvancedSyntheticDataGenerator:
         except Exception as e:
             self.log_error(f"Database Processing Error: {e}")
             return {"data": "", "source": "db"}
     def aggregate_inputs(self) -> str:
         """Combine all input sources into a single aggregated string."""
         aggregated_data = ""
@@ -151,44 +151,38 @@ class AdvancedSyntheticDataGenerator:
             aggregated_data += f"Source: {item.get('source', 'unknown')}\n"
             aggregated_data += item.get("data", "") + "\n\n"
         return aggregated_data.strip()
     def build_prompt(self) -> str:
         """
-        Build the complete prompt using aggregated data, custom instructions,
-        and the desired output format.
         """
-        aggregated_data = self.aggregate_inputs()
-        instructions = st.session_state.instructions or "Generate diverse, coherent synthetic data."
-        output_format = st.session_state.config.get("output_format", "plain_text")
-        prompt = self.custom_prompt_template.format(
-            data=aggregated_data, instructions=instructions, format=output_format
-        )
         st.write("### Built Prompt")
         st.write(prompt)
         return prompt
-    def generate_synthetic_data(self) -> bool:
         """
-        Generate synthetic data by sending the built prompt to the selected LLM provider.
-        Returns True if generation succeeds.
         """
         api_key = st.session_state.api_key
         if not api_key:
             self.log_error("API key is missing!")
             return False
         provider_name = st.session_state.config["provider"]
         provider_cfg = self.providers.get(provider_name)
         if not provider_cfg:
             self.log_error(f"Provider {provider_name} is not configured.")
             return False
         client_initializer = provider_cfg["client"]
         client = client_initializer(api_key)
         model = st.session_state.config["model"]
         temperature = st.session_state.config["temperature"]
         prompt = self.build_prompt()
         st.info(f"Using **{provider_name}** with model **{model}** at temperature **{temperature:.2f}**")
         try:
             if provider_name == "HuggingFace":
@@ -199,20 +193,18 @@ class AdvancedSyntheticDataGenerator:
             st.write("### Raw API Response")
             st.write(response)
-            synthetic_data = self._parse_response(response, provider_name)
-            st.write("### Parsed Synthetic Data")
-            st.write(synthetic_data)
-            st.session_state.synthetic_data = synthetic_data
             return True
         except Exception as e:
             self.log_error(f"Generation failed: {e}")
             return False
     def _standard_inference(self, client: Any, prompt: str, model: str, temperature: float) -> Any:
-        """
-        Inference for providers using an OpenAI-compatible API.
-        """
         try:
             st.write("Sending prompt via standard inference...")
             result = client.chat.completions.create(
@@ -225,11 +217,9 @@ class AdvancedSyntheticDataGenerator:
         except Exception as e:
             self.log_error(f"Standard Inference Error: {e}")
             return None
     def _huggingface_inference(self, client: Dict[str, Any], prompt: str, model: str) -> Any:
-        """
-        Inference for the Hugging Face Inference API.
-        """
         try:
             st.write("Sending prompt to HuggingFace API...")
             response = requests.post(
@@ -244,62 +234,68 @@ class AdvancedSyntheticDataGenerator:
         except Exception as e:
             self.log_error(f"HuggingFace Inference Error: {e}")
             return None
-    def _parse_response(self, response: Any, provider: str) -> str:
         """
-        Parse the LLM response into a synthetic data string.
         """
         st.write("Parsing response for provider:", provider)
         try:
             if provider == "HuggingFace":
                 if isinstance(response, list) and response and "generated_text" in response[0]:
-                    return response[0]["generated_text"]
                 else:
                     self.log_error("Unexpected HuggingFace response format.")
-                    return ""
             else:
-                # Expecting a structure similar to OpenAI's response.
                 if response and hasattr(response, "choices") and response.choices:
-                    return response.choices[0].message.content
                 else:
                     self.log_error("Unexpected response format from provider.")
-                    return ""
         except Exception as e:
             self.log_error(f"Response Parsing Error: {e}")
-            return ""
-# ===== Advanced UI Components =====
-def advanced_config_ui(generator: AdvancedSyntheticDataGenerator):
-    """Display advanced configuration options in the sidebar."""
     with st.sidebar:
-        st.header("Advanced Configuration")
         provider = st.selectbox("Select Provider", list(generator.providers.keys()))
         st.session_state.config["provider"] = provider
         provider_cfg = generator.providers[provider]
         model = st.selectbox("Select Model", provider_cfg["models"])
         st.session_state.config["model"] = model
         temperature = st.slider("Temperature", 0.0, 1.0, DEFAULT_TEMPERATURE)
         st.session_state.config["temperature"] = temperature
-        output_format = st.radio("Output Format", ["plain_text", "json", "csv"])
-        st.session_state.config["output_format"] = output_format
         api_key = st.text_input(f"{provider} API Key", type="password")
         st.session_state.api_key = api_key
-        instructions = st.text_area("Custom Instructions",
-                                    "Generate diverse, coherent synthetic data based on the input sources.",
-                                    height=100)
-        st.session_state.instructions = instructions
-def advanced_input_ui(generator: AdvancedSyntheticDataGenerator):
     """Display input data source options using tabs."""
-    st.subheader("Add Input Data")
     tabs = st.tabs(["Text", "PDF", "CSV", "API", "Database"])
     with tabs[0]:
@@ -343,28 +339,23 @@ def advanced_input_ui(generator: AdvancedSyntheticDataGenerator):
             st.session_state.inputs.append(generator.handle_db({"connection": db_conn, "query": db_query}))
             st.success("Database input added!")
-def advanced_output_ui(generator: AdvancedSyntheticDataGenerator):
-    """Display the generated synthetic data with output options."""
-    st.subheader("Synthetic Data Output")
-    if st.session_state.synthetic_data:
-        output_format = st.session_state.config.get("output_format", "plain_text")
-        if output_format == "json":
-            try:
-                json_output = json.loads(st.session_state.synthetic_data)
-                st.json(json_output)
-            except Exception:
-                st.text_area("Output", st.session_state.synthetic_data, height=300)
-        else:
-            st.text_area("Output", st.session_state.synthetic_data, height=300)
-        st.download_button("Download Output", st.session_state.synthetic_data,
-                           file_name="synthetic_data.txt", mime="text/plain")
     else:
-        st.info("No synthetic data generated yet.")
-def advanced_logs_ui():
-    """Display error logs and debug information in an expandable section."""
     with st.expander("Error Logs & Debug Info", expanded=False):
         if st.session_state.error_logs:
             for log in st.session_state.error_logs:
@@ -373,50 +364,39 @@ def advanced_logs_ui():
             st.write("No logs yet.")
-# ===== Main Application =====
-def main() -> None:
-    st.set_page_config(page_title="Advanced Synthetic Data Generator", layout="wide")
-    # Sidebar for advanced configuration
-    generator = AdvancedSyntheticDataGenerator()
-    advanced_config_ui(generator)
-    st.title("Advanced Synthetic Data Generator")
     st.markdown(
         """
-        Welcome! This application allows you to generate synthetic data from multiple input sources.
-        Use the sections below to add inputs, generate data, view outputs, and review logs.
         """
     )
-    # Input Data Section
-    with st.container():
-        st.header("1. Input Data Sources")
-        advanced_input_ui(generator)
-        if st.button("Clear All Inputs"):
-            st.session_state.inputs = []
-            st.success("All inputs have been cleared!")
-    # Generation Section with a clearly visible button
-    with st.container():
-        st.header("2. Generate Synthetic Data")
-        if st.button("Generate Synthetic Data", key="generate_button"):
-            with st.spinner("Generating synthetic data..."):
-                if generator.generate_synthetic_data():
-                    st.success("Synthetic data generated successfully!")
-                else:
-                    st.error("Data generation failed. Check logs for details.")
-    # Output Section
-    with st.container():
-        st.header("3. Synthetic Data Output")
-        advanced_output_ui(generator)
-    # Logs Section
-    with st.container():
-        st.header("4. Error Logs & Debug Information")
-        advanced_logs_ui()
 if __name__ == "__main__":

 import sqlalchemy
 from typing import Any, Dict, List
+# Provider clients – ensure these libraries are installed
 try:
     from openai import OpenAI
 except ImportError:
 except ImportError:
     groq = None
+# Hugging Face inference endpoint
 HF_API_URL = "https://api-inference.huggingface.co/models/"
 DEFAULT_TEMPERATURE = 0.1
 GROQ_MODEL = "mixtral-8x7b-32768"
+class QADataGenerator:
     """
+    A Q&A Synthetic Generator that extracts and generates question-answer pairs
+    from various input sources using an LLM provider.
     """
     def __init__(self) -> None:
         self._setup_providers()
         self._setup_input_handlers()
         self._initialize_session_state()
+        # This prompt instructs the LLM to generate three Q&A pairs.
         self.custom_prompt_template = (
+            "You are an expert in extracting question and answer pairs from documents. "
+            "Generate 3 Q&A pairs from the following data, formatted as a JSON list of dictionaries. "
+            "Each dictionary must have keys 'question' and 'answer'. "
+            "The questions should be clear and concise, and the answers must be based solely on the provided data with no external information. "
+            "Do not hallucinate. \n\n"
+            "Example JSON Output:\n"
+            "[{'question': 'What is the capital of France?', 'answer': 'Paris'}, "
+            "{'question': 'What is the highest mountain in the world?', 'answer': 'Mount Everest'}, "
+            "{'question': 'What is the chemical symbol for gold?', 'answer': 'Au'}]\n\n"
+            "Now, generate 3 Q&A pairs from this data:\n{data}"
         )
     def _setup_providers(self) -> None:
+        """Configure available LLM providers and their client initialization routines."""
         self.providers: Dict[str, Dict[str, Any]] = {
             "Deepseek": {
                 "client": lambda key: OpenAI(base_url="https://api.deepseek.com/v1", api_key=key) if OpenAI else None,
                 "models": ["gpt2", "llama-2"],
             },
         }
     def _setup_input_handlers(self) -> None:
         """Register handlers for different input data types."""
         self.input_handlers: Dict[str, Any] = {
             "api": self.handle_api,
             "db": self.handle_db,
         }
     def _initialize_session_state(self) -> None:
         """Initialize Streamlit session state with default configuration."""
         defaults = {
                 "provider": "OpenAI",
                 "model": "gpt-4-turbo",
                 "temperature": DEFAULT_TEMPERATURE,
             },
             "api_key": "",
+            "inputs": [],       # List to store input sources
+            "qa_pairs": "",     # Generated Q&A pairs output
+            "error_logs": [],   # To store any error messages
         }
         for key, value in defaults.items():
             if key not in st.session_state:
                 st.session_state[key] = value
     def log_error(self, message: str) -> None:
+        """Log an error message to session state and display it."""
         st.session_state.error_logs.append(message)
         st.error(message)
+    # ----- Input Handlers -----
     def handle_text(self, text: str) -> Dict[str, Any]:
         return {"data": text, "source": "text"}
     def handle_pdf(self, file) -> Dict[str, Any]:
         try:
             with pdfplumber.open(file) as pdf:
         except Exception as e:
             self.log_error(f"PDF Processing Error: {e}")
             return {"data": "", "source": "pdf"}
     def handle_csv(self, file) -> Dict[str, Any]:
         try:
             df = pd.read_csv(file)
+            # Convert the DataFrame to a JSON string
             return {"data": df.to_json(orient="records"), "source": "csv"}
         except Exception as e:
             self.log_error(f"CSV Processing Error: {e}")
             return {"data": "", "source": "csv"}
     def handle_api(self, config: Dict[str, str]) -> Dict[str, Any]:
         try:
             response = requests.get(config["url"], headers=config.get("headers", {}), timeout=10)
         except Exception as e:
             self.log_error(f"API Processing Error: {e}")
             return {"data": "", "source": "api"}
     def handle_db(self, config: Dict[str, str]) -> Dict[str, Any]:
         try:
             engine = sqlalchemy.create_engine(config["connection"])
         except Exception as e:
             self.log_error(f"Database Processing Error: {e}")
             return {"data": "", "source": "db"}
     def aggregate_inputs(self) -> str:
         """Combine all input sources into a single aggregated string."""
         aggregated_data = ""
             aggregated_data += f"Source: {item.get('source', 'unknown')}\n"
             aggregated_data += item.get("data", "") + "\n\n"
         return aggregated_data.strip()
     def build_prompt(self) -> str:
         """
+        Build the complete prompt using the custom template and aggregated inputs.
         """
+        data = self.aggregate_inputs()
+        prompt = self.custom_prompt_template.format(data=data)
         st.write("### Built Prompt")
         st.write(prompt)
         return prompt
+    def generate_qa_pairs(self) -> bool:
         """
+        Generate Q&A pairs by sending the built prompt to the selected LLM provider.
         """
         api_key = st.session_state.api_key
         if not api_key:
             self.log_error("API key is missing!")
             return False
         provider_name = st.session_state.config["provider"]
         provider_cfg = self.providers.get(provider_name)
         if not provider_cfg:
             self.log_error(f"Provider {provider_name} is not configured.")
             return False
         client_initializer = provider_cfg["client"]
         client = client_initializer(api_key)
         model = st.session_state.config["model"]
         temperature = st.session_state.config["temperature"]
         prompt = self.build_prompt()
         st.info(f"Using **{provider_name}** with model **{model}** at temperature **{temperature:.2f}**")
         try:
             if provider_name == "HuggingFace":
             st.write("### Raw API Response")
             st.write(response)
+            qa_pairs = self._parse_response(response, provider_name)
+            st.write("### Parsed Q&A Pairs")
+            st.write(qa_pairs)
+            st.session_state.qa_pairs = qa_pairs
             return True
         except Exception as e:
             self.log_error(f"Generation failed: {e}")
             return False
     def _standard_inference(self, client: Any, prompt: str, model: str, temperature: float) -> Any:
+        """Inference method for providers using an OpenAI-compatible API."""
         try:
             st.write("Sending prompt via standard inference...")
             result = client.chat.completions.create(
         except Exception as e:
             self.log_error(f"Standard Inference Error: {e}")
             return None
     def _huggingface_inference(self, client: Dict[str, Any], prompt: str, model: str) -> Any:
+        """Inference method for the Hugging Face Inference API."""
         try:
             st.write("Sending prompt to HuggingFace API...")
             response = requests.post(
         except Exception as e:
             self.log_error(f"HuggingFace Inference Error: {e}")
             return None
+    def _parse_response(self, response: Any, provider: str) -> List[Dict[str, str]]:
         """
+        Parse the LLM response and return a list of Q&A pairs.
+        Expects the response to be JSON formatted.
         """
         st.write("Parsing response for provider:", provider)
         try:
             if provider == "HuggingFace":
+                # For HuggingFace, assume the generated text is under "generated_text"
                 if isinstance(response, list) and response and "generated_text" in response[0]:
+                    raw_text = response[0]["generated_text"]
                 else:
                     self.log_error("Unexpected HuggingFace response format.")
+                    return []
             else:
+                # For OpenAI (and similar providers) assume the response is similar to:
+                # response.choices[0].message.content
                 if response and hasattr(response, "choices") and response.choices:
+                    raw_text = response.choices[0].message.content
                 else:
                     self.log_error("Unexpected response format from provider.")
+                    return []
+            # Try parsing the raw text as JSON
+            try:
+                qa_list = json.loads(raw_text)
+                if isinstance(qa_list, list):
+                    return qa_list
+                else:
+                    self.log_error("Parsed output is not a list.")
+                    return []
+            except json.JSONDecodeError as e:
+                self.log_error(f"JSON Parsing Error: {e}. Raw output: {raw_text}")
+                return []
         except Exception as e:
             self.log_error(f"Response Parsing Error: {e}")
+            return []
+# ============ UI Components ============
+def config_ui(generator: QADataGenerator):
+    """Display configuration options in the sidebar."""
     with st.sidebar:
+        st.header("Configuration")
         provider = st.selectbox("Select Provider", list(generator.providers.keys()))
         st.session_state.config["provider"] = provider
         provider_cfg = generator.providers[provider]
         model = st.selectbox("Select Model", provider_cfg["models"])
         st.session_state.config["model"] = model
         temperature = st.slider("Temperature", 0.0, 1.0, DEFAULT_TEMPERATURE)
         st.session_state.config["temperature"] = temperature
         api_key = st.text_input(f"{provider} API Key", type="password")
         st.session_state.api_key = api_key
+def input_ui(generator: QADataGenerator):
     """Display input data source options using tabs."""
+    st.subheader("Input Data Sources")
     tabs = st.tabs(["Text", "PDF", "CSV", "API", "Database"])
     with tabs[0]:
             st.session_state.inputs.append(generator.handle_db({"connection": db_conn, "query": db_query}))
             st.success("Database input added!")
+def output_ui(generator: QADataGenerator):
+    """Display the generated Q&A pairs and provide a download option."""
+    st.subheader("Q&A Pairs Output")
+    if st.session_state.qa_pairs:
+        st.write("### Generated Q&A Pairs")
+        st.write(st.session_state.qa_pairs)
+        st.download_button(
+            "Download Output",
+            json.dumps(st.session_state.qa_pairs, indent=2),
+            file_name="qa_pairs.json",
+            mime="application/json"
+        )
     else:
+        st.info("No Q&A pairs generated yet.")
+def logs_ui():
+    """Display error logs and debugging information in an expandable section."""
     with st.expander("Error Logs & Debug Info", expanded=False):
         if st.session_state.error_logs:
             for log in st.session_state.error_logs:
             st.write("No logs yet.")
+def main():
+    st.set_page_config(page_title="Advanced Q&A Synthetic Generator", layout="wide")
+    st.title("Advanced Q&A Synthetic Generator")
     st.markdown(
         """
+        Welcome to the Advanced Q&A Synthetic Generator. This tool extracts and generates question-answer pairs
+        from various input sources. Configure your provider in the sidebar, add input data, and click the button below to generate Q&A pairs.
         """
     )
+    # Initialize generator and display configuration UI
+    generator = QADataGenerator()
+    config_ui(generator)
+    st.header("1. Input Data")
+    input_ui(generator)
+    if st.button("Clear All Inputs"):
+        st.session_state.inputs = []
+        st.success("All inputs have been cleared!")
+    st.header("2. Generate Q&A Pairs")
+    if st.button("Generate Q&A Pairs", key="generate_qa"):
+        with st.spinner("Generating Q&A pairs..."):
+            if generator.generate_qa_pairs():
+                st.success("Q&A pairs generated successfully!")
+            else:
+                st.error("Q&A generation failed. Check logs for details.")
+    st.header("3. Output")
+    output_ui(generator)
+    st.header("4. Logs & Debug Information")
+    logs_ui()
 if __name__ == "__main__":