Spaces:

mgbam
/

sythenticdata

Sleeping

App Files Files Community

mgbam committed on Feb 9

Commit

c608949

verified ·

1 Parent(s): accacff

Update app.py

Browse files

Files changed (1) hide show

app.py +47 -39

app.py CHANGED Viewed

@@ -1,10 +1,11 @@
 import json
 import requests
 import streamlit as st
 import pdfplumber
 import pandas as pd
 import sqlalchemy
-from typing import Any, Dict, List
 # Provider clients – ensure these libraries are installed
 try:
@@ -17,10 +18,10 @@ try:
 except ImportError:
     groq = None
-# Hugging Face inference endpoint
-HF_API_URL = "https://api-inference.huggingface.co/models/"
-DEFAULT_TEMPERATURE = 0.1
-GROQ_MODEL = "mixtral-8x7b-32768"
 class QADataGenerator:
@@ -32,8 +33,8 @@ class QADataGenerator:
         self._setup_providers()
         self._setup_input_handlers()
         self._initialize_session_state()
-        # Updated prompt template with escaped curly braces
-        self.custom_prompt_template = (
             "You are an expert in extracting question and answer pairs from documents. "
             "Generate 3 Q&A pairs from the following data, formatted as a JSON list of dictionaries. "
             "Each dictionary must have keys 'question' and 'answer'. "
@@ -69,7 +70,7 @@ class QADataGenerator:
     def _setup_input_handlers(self) -> None:
         """Register handlers for different input data types."""
-        self.input_handlers: Dict[str, Any] = {
             "text": self.handle_text,
             "pdf": self.handle_pdf,
             "csv": self.handle_csv,
@@ -79,7 +80,7 @@ class QADataGenerator:
     def _initialize_session_state(self) -> None:
         """Initialize Streamlit session state with default configuration."""
-        defaults = {
             "config": {
                 "provider": "OpenAI",
                 "model": "gpt-4-turbo",
@@ -101,30 +102,31 @@ class QADataGenerator:
     # ----- Input Handlers -----
     def handle_text(self, text: str) -> Dict[str, Any]:
         return {"data": text, "source": "text"}
     def handle_pdf(self, file) -> Dict[str, Any]:
         try:
             with pdfplumber.open(file) as pdf:
-                full_text = ""
-                for page in pdf.pages:
-                    page_text = page.extract_text() or ""
-                    full_text += page_text + "\n"
-                return {"data": full_text, "source": "pdf"}
         except Exception as e:
             self.log_error(f"PDF Processing Error: {e}")
             return {"data": "", "source": "pdf"}
     def handle_csv(self, file) -> Dict[str, Any]:
         try:
             df = pd.read_csv(file)
-            # Convert the DataFrame to a JSON string
-            return {"data": df.to_json(orient="records"), "source": "csv"}
         except Exception as e:
             self.log_error(f"CSV Processing Error: {e}")
             return {"data": "", "source": "csv"}
     def handle_api(self, config: Dict[str, str]) -> Dict[str, Any]:
         try:
             response = requests.get(config["url"], headers=config.get("headers", {}), timeout=10)
             response.raise_for_status()
@@ -134,6 +136,7 @@ class QADataGenerator:
             return {"data": "", "source": "api"}
     def handle_db(self, config: Dict[str, str]) -> Dict[str, Any]:
         try:
             engine = sqlalchemy.create_engine(config["connection"])
             with engine.connect() as conn:
@@ -166,22 +169,22 @@ class QADataGenerator:
         """
         Generate Q&A pairs by sending the built prompt to the selected LLM provider.
         """
-        api_key = st.session_state.api_key
         if not api_key:
             self.log_error("API key is missing!")
             return False
-        provider_name = st.session_state.config["provider"]
-        provider_cfg = self.providers.get(provider_name)
         if not provider_cfg:
             self.log_error(f"Provider {provider_name} is not configured.")
             return False
-        client_initializer = provider_cfg["client"]
         client = client_initializer(api_key)
-        model = st.session_state.config["model"]
-        temperature = st.session_state.config["temperature"]
-        prompt = self.build_prompt()
         st.info(f"Using **{provider_name}** with model **{model}** at temperature **{temperature:.2f}**")
         try:
@@ -238,36 +241,40 @@ class QADataGenerator:
     def _parse_response(self, response: Any, provider: str) -> List[Dict[str, str]]:
         """
         Parse the LLM response and return a list of Q&A pairs.
-        Expects the response to be JSON formatted.
         """
         st.write("Parsing response for provider:", provider)
         try:
             if provider == "HuggingFace":
-                # For HuggingFace, assume the generated text is under "generated_text"
                 if isinstance(response, list) and response and "generated_text" in response[0]:
                     raw_text = response[0]["generated_text"]
                 else:
                     self.log_error("Unexpected HuggingFace response format.")
                     return []
             else:
-                # For OpenAI (and similar providers) assume the response is similar to:
-                # response.choices[0].message.content
                 if response and hasattr(response, "choices") and response.choices:
                     raw_text = response.choices[0].message.content
                 else:
                     self.log_error("Unexpected response format from provider.")
                     return []
-            # Try parsing the raw text as JSON
             try:
                 qa_list = json.loads(raw_text)
-                if isinstance(qa_list, list):
-                    return qa_list
-                else:
-                    self.log_error("Parsed output is not a list.")
-                    return []
             except json.JSONDecodeError as e:
-                self.log_error(f"JSON Parsing Error: {e}. Raw output: {raw_text}")
                 return []
         except Exception as e:
             self.log_error(f"Response Parsing Error: {e}")
@@ -276,7 +283,7 @@ class QADataGenerator:
 # ============ UI Components ============
-def config_ui(generator: QADataGenerator):
     """Display configuration options in the sidebar."""
     with st.sidebar:
         st.header("Configuration")
@@ -293,7 +300,7 @@ def config_ui(generator: QADataGenerator):
         api_key = st.text_input(f"{provider} API Key", type="password")
         st.session_state.api_key = api_key
-def input_ui(generator: QADataGenerator):
     """Display input data source options using tabs."""
     st.subheader("Input Data Sources")
     tabs = st.tabs(["Text", "PDF", "CSV", "API", "Database"])
@@ -339,7 +346,7 @@ def input_ui(generator: QADataGenerator):
             st.session_state.inputs.append(generator.handle_db({"connection": db_conn, "query": db_query}))
             st.success("Database input added!")
-def output_ui(generator: QADataGenerator):
     """Display the generated Q&A pairs and provide a download option."""
     st.subheader("Q&A Pairs Output")
     if st.session_state.qa_pairs:
@@ -354,7 +361,7 @@ def output_ui(generator: QADataGenerator):
     else:
         st.info("No Q&A pairs generated yet.")
-def logs_ui():
     """Display error logs and debugging information in an expandable section."""
     with st.expander("Error Logs & Debug Info", expanded=False):
         if st.session_state.error_logs:
@@ -363,7 +370,8 @@ def logs_ui():
         else:
             st.write("No logs yet.")
-def main():
     st.set_page_config(page_title="Advanced Q&A Synthetic Generator", layout="wide")
     st.title("Advanced Q&A Synthetic Generator")
     st.markdown(

 import json
+import ast
 import requests
 import streamlit as st
 import pdfplumber
 import pandas as pd
 import sqlalchemy
+from typing import Any, Dict, List, Callable
 # Provider clients – ensure these libraries are installed
 try:
 except ImportError:
     groq = None
+# Hugging Face inference endpoint and defaults
+HF_API_URL: str = "https://api-inference.huggingface.co/models/"
+DEFAULT_TEMPERATURE: float = 0.1
+GROQ_MODEL: str = "mixtral-8x7b-32768"
 class QADataGenerator:
         self._setup_providers()
         self._setup_input_handlers()
         self._initialize_session_state()
+        # Updated prompt template with escaped curly braces for literal output
+        self.custom_prompt_template: str = (
             "You are an expert in extracting question and answer pairs from documents. "
             "Generate 3 Q&A pairs from the following data, formatted as a JSON list of dictionaries. "
             "Each dictionary must have keys 'question' and 'answer'. "
     def _setup_input_handlers(self) -> None:
         """Register handlers for different input data types."""
+        self.input_handlers: Dict[str, Callable[[Any], Dict[str, Any]]] = {
             "text": self.handle_text,
             "pdf": self.handle_pdf,
             "csv": self.handle_csv,
     def _initialize_session_state(self) -> None:
         """Initialize Streamlit session state with default configuration."""
+        defaults: Dict[str, Any] = {
             "config": {
                 "provider": "OpenAI",
                 "model": "gpt-4-turbo",
     # ----- Input Handlers -----
     def handle_text(self, text: str) -> Dict[str, Any]:
+        """Process plain text input."""
         return {"data": text, "source": "text"}
     def handle_pdf(self, file) -> Dict[str, Any]:
+        """Extract text from a PDF file."""
         try:
             with pdfplumber.open(file) as pdf:
+                full_text = "\n".join(page.extract_text() or "" for page in pdf.pages)
+            return {"data": full_text, "source": "pdf"}
         except Exception as e:
             self.log_error(f"PDF Processing Error: {e}")
             return {"data": "", "source": "pdf"}
     def handle_csv(self, file) -> Dict[str, Any]:
+        """Process a CSV file by converting it to JSON."""
         try:
             df = pd.read_csv(file)
+            json_data = df.to_json(orient="records")
+            return {"data": json_data, "source": "csv"}
         except Exception as e:
             self.log_error(f"CSV Processing Error: {e}")
             return {"data": "", "source": "csv"}
     def handle_api(self, config: Dict[str, str]) -> Dict[str, Any]:
+        """Fetch data from an API endpoint."""
         try:
             response = requests.get(config["url"], headers=config.get("headers", {}), timeout=10)
             response.raise_for_status()
             return {"data": "", "source": "api"}
     def handle_db(self, config: Dict[str, str]) -> Dict[str, Any]:
+        """Query a database using the provided connection string and SQL query."""
         try:
             engine = sqlalchemy.create_engine(config["connection"])
             with engine.connect() as conn:
         """
         Generate Q&A pairs by sending the built prompt to the selected LLM provider.
         """
+        api_key: str = st.session_state.api_key
         if not api_key:
             self.log_error("API key is missing!")
             return False
+        provider_name: str = st.session_state.config["provider"]
+        provider_cfg: Dict[str, Any] = self.providers.get(provider_name, {})
         if not provider_cfg:
             self.log_error(f"Provider {provider_name} is not configured.")
             return False
+        client_initializer: Callable[[str], Any] = provider_cfg["client"]
         client = client_initializer(api_key)
+        model: str = st.session_state.config["model"]
+        temperature: float = st.session_state.config["temperature"]
+        prompt: str = self.build_prompt()
         st.info(f"Using **{provider_name}** with model **{model}** at temperature **{temperature:.2f}**")
         try:
     def _parse_response(self, response: Any, provider: str) -> List[Dict[str, str]]:
         """
         Parse the LLM response and return a list of Q&A pairs.
+        Expects the response to be JSON formatted; if JSON decoding fails,
+        tries to use ast.literal_eval as a fallback.
         """
         st.write("Parsing response for provider:", provider)
         try:
+            # Extract the raw text from the response; its layout depends on the provider.
             if provider == "HuggingFace":
                 if isinstance(response, list) and response and "generated_text" in response[0]:
                     raw_text = response[0]["generated_text"]
                 else:
                     self.log_error("Unexpected HuggingFace response format.")
                     return []
             else:
                 if response and hasattr(response, "choices") and response.choices:
                     raw_text = response.choices[0].message.content
                 else:
                     self.log_error("Unexpected response format from provider.")
                     return []
+            # Attempt to parse using json.loads first.
             try:
                 qa_list = json.loads(raw_text)
             except json.JSONDecodeError as e:
+                self.log_error(f"JSON Parsing Error: {e}. Attempting fallback with ast.literal_eval. Raw output: {raw_text}")
+                try:
+                    qa_list = ast.literal_eval(raw_text)
+                except Exception as e2:
+                    self.log_error(f"ast.literal_eval failed: {e2}")
+                    return []
+            if isinstance(qa_list, list):
+                return qa_list
+            else:
+                self.log_error("Parsed output is not a list.")
                 return []
         except Exception as e:
             self.log_error(f"Response Parsing Error: {e}")
 # ============ UI Components ============
+def config_ui(generator: QADataGenerator) -> None:
     """Display configuration options in the sidebar."""
     with st.sidebar:
         st.header("Configuration")
         api_key = st.text_input(f"{provider} API Key", type="password")
         st.session_state.api_key = api_key
+def input_ui(generator: QADataGenerator) -> None:
     """Display input data source options using tabs."""
     st.subheader("Input Data Sources")
     tabs = st.tabs(["Text", "PDF", "CSV", "API", "Database"])
             st.session_state.inputs.append(generator.handle_db({"connection": db_conn, "query": db_query}))
             st.success("Database input added!")
+def output_ui(generator: QADataGenerator) -> None:
     """Display the generated Q&A pairs and provide a download option."""
     st.subheader("Q&A Pairs Output")
     if st.session_state.qa_pairs:
     else:
         st.info("No Q&A pairs generated yet.")
+def logs_ui() -> None:
     """Display error logs and debugging information in an expandable section."""
     with st.expander("Error Logs & Debug Info", expanded=False):
         if st.session_state.error_logs:
         else:
             st.write("No logs yet.")
+def main() -> None:
+    """Main Streamlit application entry point."""
     st.set_page_config(page_title="Advanced Q&A Synthetic Generator", layout="wide")
     st.title("Advanced Q&A Synthetic Generator")
     st.markdown(