mgbam commited on
Commit
945d7f4
·
verified ·
1 Parent(s): 1148bbc

Update app.py: make the number of generated Q&A pairs configurable (new `num_pairs` setting, UI number input, and `{num_pairs}` placeholder in the prompt template) and add an `ast.literal_eval` fallback in `_parse_response` for LLM outputs that are Python-dict-style rather than strict JSON (e.g. single-quoted keys).

Browse files
Files changed (1) hide show
  1. app.py +34 -18
app.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import json
2
  import requests
3
  import streamlit as st
@@ -32,19 +33,20 @@ class QADataGenerator:
32
  self._setup_providers()
33
  self._setup_input_handlers()
34
  self._initialize_session_state()
35
- # This prompt instructs the LLM to generate three Q&A pairs.
36
- # Note: Literal curly braces in the example are escaped with double braces.
37
  self.custom_prompt_template = (
38
  "You are an expert in extracting question and answer pairs from documents. "
39
- "Generate 3 Q&A pairs from the following data, formatted as a JSON list of dictionaries. "
40
  "Each dictionary must have keys 'question' and 'answer'. "
41
  "The questions should be clear and concise, and the answers must be based solely on the provided data with no external information. "
42
  "Do not hallucinate.\n\n"
43
- "Example JSON Output:\n"
44
- "[{{'question': 'What is the capital of France?', 'answer': 'Paris'}}, "
45
- "{{'question': 'What is the highest mountain in the world?', 'answer': 'Mount Everest'}}, "
46
- "{{'question': 'What is the chemical symbol for gold?', 'answer': 'Au'}}]\n\n"
47
- "Now, generate 3 Q&A pairs from this data:\n{data}"
 
48
  )
49
 
50
  def _setup_providers(self) -> None:
@@ -85,6 +87,7 @@ class QADataGenerator:
85
  "provider": "OpenAI",
86
  "model": "gpt-4-turbo",
87
  "temperature": DEFAULT_TEMPERATURE,
 
88
  },
89
  "api_key": "",
90
  "inputs": [], # List to store input sources
@@ -156,9 +159,11 @@ class QADataGenerator:
156
  def build_prompt(self) -> str:
157
  """
158
  Build the complete prompt using the custom template and aggregated inputs.
 
159
  """
160
  data = self.aggregate_inputs()
161
- prompt = self.custom_prompt_template.format(data=data)
 
162
  st.write("### Built Prompt")
163
  st.write(prompt)
164
  return prompt
@@ -239,27 +244,25 @@ class QADataGenerator:
239
  def _parse_response(self, response: Any, provider: str) -> List[Dict[str, str]]:
240
  """
241
  Parse the LLM response and return a list of Q&A pairs.
242
- Expects the response to be JSON formatted.
 
243
  """
244
  st.write("Parsing response for provider:", provider)
245
  try:
246
  if provider == "HuggingFace":
247
- # For HuggingFace, assume the generated text is under "generated_text"
248
  if isinstance(response, list) and response and "generated_text" in response[0]:
249
  raw_text = response[0]["generated_text"]
250
  else:
251
  self.log_error("Unexpected HuggingFace response format.")
252
  return []
253
  else:
254
- # For OpenAI (and similar providers), assume the response is similar to:
255
- # response.choices[0].message.content
256
  if response and hasattr(response, "choices") and response.choices:
257
  raw_text = response.choices[0].message.content
258
  else:
259
  self.log_error("Unexpected response format from provider.")
260
  return []
261
 
262
- # Try parsing the raw text as JSON
263
  try:
264
  qa_list = json.loads(raw_text)
265
  if isinstance(qa_list, list):
@@ -267,9 +270,18 @@ class QADataGenerator:
267
  else:
268
  self.log_error("Parsed output is not a list.")
269
  return []
270
- except json.JSONDecodeError as e:
271
- self.log_error(f"JSON Parsing Error: {e}. Raw output: {raw_text}")
272
- return []
 
 
 
 
 
 
 
 
 
273
  except Exception as e:
274
  self.log_error(f"Response Parsing Error: {e}")
275
  return []
@@ -291,6 +303,10 @@ def config_ui(generator: QADataGenerator):
291
  temperature = st.slider("Temperature", 0.0, 1.0, DEFAULT_TEMPERATURE)
292
  st.session_state.config["temperature"] = temperature
293
 
 
 
 
 
294
  api_key = st.text_input(f"{provider} API Key", type="password")
295
  st.session_state.api_key = api_key
296
 
@@ -401,4 +417,4 @@ def main():
401
 
402
 
403
  if __name__ == "__main__":
404
- main()
 
1
+ import ast
2
  import json
3
  import requests
4
  import streamlit as st
 
33
  self._setup_providers()
34
  self._setup_input_handlers()
35
  self._initialize_session_state()
36
+ # This prompt instructs the LLM to generate a configurable number of Q&A pairs.
37
+ # Note: Literal curly braces for the example are escaped with double braces.
38
  self.custom_prompt_template = (
39
  "You are an expert in extracting question and answer pairs from documents. "
40
+ "Generate {num_pairs} Q&A pairs from the following data, formatted as a JSON list of dictionaries. "
41
  "Each dictionary must have keys 'question' and 'answer'. "
42
  "The questions should be clear and concise, and the answers must be based solely on the provided data with no external information. "
43
  "Do not hallucinate.\n\n"
44
+ "Example JSON Output for {num_pairs} pairs:\n"
45
+ "[{{'question': 'Example question 1', 'answer': 'Example answer 1'}}, "
46
+ "{{'question': 'Example question 2', 'answer': 'Example answer 2'}}, "
47
+ "..., "
48
+ "{{'question': 'Example question {num_pairs}', 'answer': 'Example answer {num_pairs}'}}]\n\n"
49
+ "Now, generate {num_pairs} Q&A pairs from this data:\n{data}"
50
  )
51
 
52
  def _setup_providers(self) -> None:
 
87
  "provider": "OpenAI",
88
  "model": "gpt-4-turbo",
89
  "temperature": DEFAULT_TEMPERATURE,
90
+ "num_pairs": 3, # Default to 3 Q&A pairs
91
  },
92
  "api_key": "",
93
  "inputs": [], # List to store input sources
 
159
  def build_prompt(self) -> str:
160
  """
161
  Build the complete prompt using the custom template and aggregated inputs.
162
+ The number of Q&A pairs is inserted via the {num_pairs} placeholder.
163
  """
164
  data = self.aggregate_inputs()
165
+ num_pairs = st.session_state.config.get("num_pairs", 3)
166
+ prompt = self.custom_prompt_template.format(data=data, num_pairs=num_pairs)
167
  st.write("### Built Prompt")
168
  st.write(prompt)
169
  return prompt
 
244
  def _parse_response(self, response: Any, provider: str) -> List[Dict[str, str]]:
245
  """
246
  Parse the LLM response and return a list of Q&A pairs.
247
+ Expects the response to be in a JSON-like format.
248
+ If JSON parsing fails (e.g. due to single quotes), fall back to ast.literal_eval.
249
  """
250
  st.write("Parsing response for provider:", provider)
251
  try:
252
  if provider == "HuggingFace":
 
253
  if isinstance(response, list) and response and "generated_text" in response[0]:
254
  raw_text = response[0]["generated_text"]
255
  else:
256
  self.log_error("Unexpected HuggingFace response format.")
257
  return []
258
  else:
 
 
259
  if response and hasattr(response, "choices") and response.choices:
260
  raw_text = response.choices[0].message.content
261
  else:
262
  self.log_error("Unexpected response format from provider.")
263
  return []
264
 
265
+ # Try parsing as JSON first
266
  try:
267
  qa_list = json.loads(raw_text)
268
  if isinstance(qa_list, list):
 
270
  else:
271
  self.log_error("Parsed output is not a list.")
272
  return []
273
+ except json.JSONDecodeError:
274
+ st.write("Standard JSON parsing failed. Falling back to ast.literal_eval...")
275
+ try:
276
+ qa_list = ast.literal_eval(raw_text)
277
+ if isinstance(qa_list, list):
278
+ return qa_list
279
+ else:
280
+ self.log_error("Parsed output using ast.literal_eval is not a list.")
281
+ return []
282
+ except Exception as e:
283
+ self.log_error(f"ast.literal_eval parsing error: {e}. Raw output: {raw_text}")
284
+ return []
285
  except Exception as e:
286
  self.log_error(f"Response Parsing Error: {e}")
287
  return []
 
303
  temperature = st.slider("Temperature", 0.0, 1.0, DEFAULT_TEMPERATURE)
304
  st.session_state.config["temperature"] = temperature
305
 
306
+ # New: Number of Q&A pairs
307
+ num_pairs = st.number_input("Number of Q&A Pairs", min_value=1, max_value=20, value=3, step=1)
308
+ st.session_state.config["num_pairs"] = num_pairs
309
+
310
  api_key = st.text_input(f"{provider} API Key", type="password")
311
  st.session_state.api_key = api_key
312
 
 
417
 
418
 
419
  if __name__ == "__main__":
420
+ main()