mgbam committed
Commit a4956fa · verified · 1 Parent(s): f1d2989

Update app.py

Files changed (1)
  1. app.py +58 -56
app.py CHANGED

@@ -24,27 +24,28 @@ DEFAULT_TEMPERATURE: float = 0.1
 GROQ_MODEL: str = "mixtral-8x7b-32768"


-class QADataGenerator:
+class SyntheticDataGenerator:
     """
-    A Q&A Synthetic Generator that extracts and generates question-answer pairs
-    from various input sources using an LLM provider.
+    An advanced Synthetic Data Generator for creating training examples for fine-tuning.
+
+    The generator accepts various input sources and then uses an LLM provider to create
+    synthetic examples in JSON format. Each example is a dictionary with 'input' and 'output' keys.
     """
     def __init__(self) -> None:
         self._setup_providers()
         self._setup_input_handlers()
         self._initialize_session_state()
-        # Prompt template with a dynamic {num_examples} parameter and escaped curly braces
+        # Prompt template with dynamic {num_examples} parameter and escaped curly braces.
         self.custom_prompt_template: str = (
-            "You are an expert in extracting question and answer pairs from documents. "
-            "Generate {num_examples} Q&A pairs from the following data, formatted as a JSON list of dictionaries. "
-            "Each dictionary must have keys 'question' and 'answer'. "
-            "The questions should be clear and concise, and the answers must be based solely on the provided data with no external information. "
-            "Do not hallucinate. \n\n"
+            "You are an expert in generating synthetic training data for fine-tuning. "
+            "Generate {num_examples} training examples from the following data, formatted as a JSON list of dictionaries. "
+            "Each dictionary must have keys 'input' and 'output'. "
+            "The examples should be clear, diverse, and based solely on the provided data. "
+            "Do not add any external information. \n\n"
             "Example JSON Output:\n"
-            "[{{'question': 'What is the capital of France?', 'answer': 'Paris'}}, "
-            "{{'question': 'What is the highest mountain in the world?', 'answer': 'Mount Everest'}}, "
-            "{{'question': 'What is the chemical symbol for gold?', 'answer': 'Au'}}]\n\n"
-            "Now, generate {num_examples} Q&A pairs from this data:\n{data}"
+            "[{{'input': 'sample input text 1', 'output': 'sample output text 1'}}, "
+            "{{'input': 'sample input text 2', 'output': 'sample output text 2'}}]\n\n"
+            "Now, generate {num_examples} training examples from this data:\n{data}"
         )

     def _setup_providers(self) -> None:

@@ -85,12 +86,12 @@ class QADataGenerator:
                 "provider": "OpenAI",
                 "model": "gpt-4-turbo",
                 "temperature": DEFAULT_TEMPERATURE,
-                "num_examples": 3,  # Default number of Q&A pairs
+                "num_examples": 3,  # Default number of synthetic examples
             },
             "api_key": "",
-            "inputs": [],  # List to store input sources
-            "qa_pairs": None,  # Generated Q&A pairs output
-            "error_logs": [],  # To store error messages
+            "inputs": [],  # List to store input sources
+            "synthetic_data": None,  # Generated synthetic data output
+            "error_logs": [],  # To store error messages
         }
         for key, value in defaults.items():
             if key not in st.session_state:

@@ -185,9 +186,9 @@ class QADataGenerator:
         st.write(prompt)
         return prompt

-    def generate_qa_pairs(self) -> bool:
+    def generate_synthetic_data(self) -> bool:
         """
-        Generate Q&A pairs by sending the built prompt to the selected LLM provider.
+        Generate synthetic training examples by sending the built prompt to the selected LLM provider.
         """
         api_key: str = st.session_state.api_key
         if not api_key:

@@ -216,11 +217,11 @@ class QADataGenerator:
             st.write("### Raw API Response")
             st.write(response)

-            qa_pairs = self._parse_response(response, provider_name)
-            st.write("### Parsed Q&A Pairs")
-            st.write(qa_pairs)
+            synthetic_examples = self._parse_response(response, provider_name)
+            st.write("### Parsed Synthetic Data")
+            st.write(synthetic_examples)

-            st.session_state.qa_pairs = qa_pairs
+            st.session_state.synthetic_data = synthetic_examples
             return True
         except Exception as e:
             self.log_error(f"Generation failed: {e}")

@@ -260,7 +261,7 @@ class QADataGenerator:

     def _parse_response(self, response: Any, provider: str) -> List[Dict[str, str]]:
         """
-        Parse the LLM response and return a list of Q&A pairs.
+        Parse the LLM response and return a list of synthetic training examples.
         Expects the response to be JSON formatted; if JSON decoding fails,
         uses ast.literal_eval as a fallback.
         """

@@ -280,17 +281,17 @@ class QADataGenerator:
             return []

         try:
-            qa_list = json.loads(raw_text)
+            examples = json.loads(raw_text)
         except json.JSONDecodeError as e:
             self.log_error(f"JSON Parsing Error: {e}. Attempting fallback with ast.literal_eval. Raw output: {raw_text}")
             try:
-                qa_list = ast.literal_eval(raw_text)
+                examples = ast.literal_eval(raw_text)
             except Exception as e2:
                 self.log_error(f"ast.literal_eval failed: {e2}")
                 return []

-        if isinstance(qa_list, list):
-            return qa_list
+        if isinstance(examples, list):
+            return examples
         else:
             self.log_error("Parsed output is not a list.")
             return []

@@ -301,11 +302,11 @@ class QADataGenerator:


 # ============ UI Components ============

-def config_ui(generator: QADataGenerator) -> None:
+def config_ui(generator: SyntheticDataGenerator) -> None:
     """Display configuration options in the sidebar and update URL query parameters."""
     with st.sidebar:
         st.header("Configuration")
-        # Retrieve any query parameters from the URL
+        # Retrieve query parameters (if any)
         params = st.experimental_get_query_params()
         default_provider = params.get("provider", ["OpenAI"])[0]
         default_model = params.get("model", ["gpt-4-turbo"])[0]

@@ -326,22 +327,22 @@ def config_ui(generator: QADataGenerator) -> None:
         temperature = st.slider("Temperature", 0.0, 1.0, default_temperature)
         st.session_state.config["temperature"] = temperature

-        num_examples = st.number_input("Number of Q&A Pairs", min_value=1, max_value=10,
+        num_examples = st.number_input("Number of Training Examples", min_value=1, max_value=10,
                                        value=default_num_examples, step=1)
         st.session_state.config["num_examples"] = num_examples

         api_key = st.text_input(f"{provider} API Key", type="password")
         st.session_state.api_key = api_key

-        # Update the URL query parameters for sharing/pre-populating configuration
-        st.experimental_set_query_params(
+        # Update URL query parameters using the new API (st.set_query_params)
+        st.set_query_params(
             provider=st.session_state.config["provider"],
             model=st.session_state.config["model"],
             temperature=st.session_state.config["temperature"],
             num_examples=st.session_state.config["num_examples"],
         )

-def input_ui(generator: QADataGenerator) -> None:
+def input_ui(generator: SyntheticDataGenerator) -> None:
     """Display input data source options using tabs."""
     st.subheader("Input Data Sources")
     tabs = st.tabs(["Text", "PDF", "CSV", "API", "Database"])

@@ -387,35 +388,35 @@ def input_ui(generator: QADataGenerator) -> None:
             st.session_state.inputs.append(generator.handle_db({"connection": db_conn, "query": db_query}))
             st.success("Database input added!")

-def output_ui(generator: QADataGenerator) -> None:
-    """Display the generated Q&A pairs and provide download options (JSON and CSV)."""
-    st.subheader("Q&A Pairs Output")
-    if st.session_state.qa_pairs:
-        st.write("### Generated Q&A Pairs")
-        st.write(st.session_state.qa_pairs)
+def output_ui(generator: SyntheticDataGenerator) -> None:
+    """Display the generated synthetic data and provide download options (JSON and CSV)."""
+    st.subheader("Synthetic Data Output")
+    if st.session_state.synthetic_data:
+        st.write("### Generated Training Examples")
+        st.write(st.session_state.synthetic_data)

         # Download as JSON
         st.download_button(
             "Download as JSON",
-            json.dumps(st.session_state.qa_pairs, indent=2),
-            file_name="qa_pairs.json",
+            json.dumps(st.session_state.synthetic_data, indent=2),
+            file_name="synthetic_data.json",
             mime="application/json"
         )

         # Download as CSV
         try:
-            df = pd.DataFrame(st.session_state.qa_pairs)
+            df = pd.DataFrame(st.session_state.synthetic_data)
             csv_data = df.to_csv(index=False)
             st.download_button(
                 "Download as CSV",
                 csv_data,
-                file_name="qa_pairs.csv",
+                file_name="synthetic_data.csv",
                 mime="text/csv"
             )
         except Exception as e:
             st.error(f"Error generating CSV: {e}")
     else:
-        st.info("No Q&A pairs generated yet.")
+        st.info("No synthetic data generated yet.")

 def logs_ui() -> None:
     """Display error logs and debugging information in an expandable section."""

@@ -428,17 +429,18 @@ def logs_ui() -> None:

 def main() -> None:
     """Main Streamlit application entry point."""
-    st.set_page_config(page_title="Advanced Q&A Synthetic Generator", layout="wide")
-    st.title("Advanced Q&A Synthetic Generator")
+    st.set_page_config(page_title="Advanced Synthetic Data Generator", layout="wide")
+    st.title("Advanced Synthetic Data Generator")
     st.markdown(
         """
-        Welcome to the Advanced Q&A Synthetic Generator. This tool extracts and generates question-answer pairs
-        from various input sources. Configure your provider in the sidebar, add input data, and click the button below to generate Q&A pairs.
+        Welcome to the Advanced Synthetic Data Generator. This tool creates synthetic training examples
+        for fine-tuning models. Configure your provider in the sidebar, add input data, and click the button
+        below to generate synthetic data.
         """
     )

     # Initialize generator and display configuration UI
-    generator = QADataGenerator()
+    generator = SyntheticDataGenerator()
     config_ui(generator)

     st.header("1. Input Data")

@@ -447,13 +449,13 @@ def main() -> None:
         st.session_state.inputs = []
         st.success("All inputs have been cleared!")

-    st.header("2. Generate Q&A Pairs")
-    if st.button("Generate Q&A Pairs", key="generate_qa"):
-        with st.spinner("Generating Q&A pairs..."):
-            if generator.generate_qa_pairs():
-                st.success("Q&A pairs generated successfully!")
+    st.header("2. Generate Synthetic Data")
+    if st.button("Generate Synthetic Data", key="generate_data"):
+        with st.spinner("Generating synthetic data..."):
+            if generator.generate_synthetic_data():
+                st.success("Synthetic data generated successfully!")
             else:
-                st.error("Q&A generation failed. Check logs for details.")
+                st.error("Data generation failed. Check logs for details.")

     st.header("3. Output")
     output_ui(generator)
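
Note on the prompt template: the braces around the example JSON are doubled so that str.format treats them as literal braces while still substituting {num_examples} and {data}. A minimal standalone sketch of that rendering (the variable names and abbreviated template here are illustrative, not taken from app.py):

    # Minimal sketch: how the escaped-brace template renders with str.format.
    template = (
        "Generate {num_examples} training examples from the following data.\n"
        "Example JSON Output:\n"
        "[{{'input': 'sample input text 1', 'output': 'sample output text 1'}}]\n\n"
        "Now, generate {num_examples} training examples from this data:\n{data}"
    )

    source_text = "raw text collected from the input tabs"  # illustrative value
    prompt = template.format(num_examples=3, data=source_text)

    # '{{' and '}}' come out as single literal braces, so the JSON example survives intact,
    # while {num_examples} and {data} are replaced with the supplied values.
    print(prompt)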
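Note on parsing: _parse_response tries json.loads first and falls back to ast.literal_eval, which matters because the example output in the prompt uses single quotes (a valid Python literal, but not valid JSON). A small illustration of that fallback path, assuming a raw reply shaped like the prompt's example:

    import ast
    import json

    # A reply shaped like the prompt's example: single quotes make it a Python literal, not JSON.
    raw_text = "[{'input': 'sample input text 1', 'output': 'sample output text 1'}]"

    try:
        examples = json.loads(raw_text)
    except json.JSONDecodeError:
        # Fallback used in _parse_response: Python literal syntax accepts single-quoted strings.
        examples = ast.literal_eval(raw_text)

    assert isinstance(examples, list)
    print(examples[0]["output"])  # -> sample output text 1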
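One caveat on the config_ui change: st.set_query_params does not appear to be part of Streamlit's public API, and the same function still reads parameters with st.experimental_get_query_params(). Two calls that do exist are sketched below; which one fits depends on the Streamlit version pinned for the Space (the version notes are assumptions, not from the commit):

    import streamlit as st

    config = {"provider": "OpenAI", "model": "gpt-4-turbo", "temperature": 0.1, "num_examples": 3}

    # Option 1: the older, deprecated helper the previous revision used; it still pairs
    # with st.experimental_get_query_params().
    st.experimental_set_query_params(**config)

    # Option 2: the dict-like st.query_params property (Streamlit >= 1.30); values are
    # stored as strings in the URL.
    for key, value in config.items():
        st.query_params[key] = str(value)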