Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,101 +1,182 @@
|
|
1 |
import json
|
2 |
import gradio as gr
|
3 |
import duckdb
|
|
|
|
|
4 |
|
5 |
# Load the Parquet dataset path
|
6 |
dataset_path = 'sample_contract_df.parquet' # Update with your Parquet file path
|
7 |
|
8 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
def load_dataset_schema():
|
10 |
con = duckdb.connect()
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
|
16 |
-
#
|
17 |
def parse_query(nl_query):
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
'greater than or equal to'
|
27 |
-
'less than or equal to'
|
28 |
-
'greater than'
|
29 |
-
'less than'
|
30 |
-
'
|
31 |
-
'
|
32 |
-
'
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
return sql_query
|
38 |
|
39 |
-
#
|
40 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
41 |
condition = parse_query(query)
|
42 |
sql_query = f"SELECT * FROM contract_data WHERE {condition}"
|
43 |
return sql_query
|
44 |
|
45 |
-
# Execute the SQL query and
|
46 |
def execute_query(sql_query):
|
|
|
|
|
|
|
47 |
try:
|
48 |
con = duckdb.connect()
|
49 |
-
|
|
|
50 |
result_df = con.execute(sql_query).fetchdf()
|
51 |
con.close()
|
52 |
return result_df, ""
|
53 |
except Exception as e:
|
54 |
-
|
55 |
return None, f"Error executing query: {e}"
|
56 |
|
57 |
-
#
|
58 |
-
|
59 |
-
|
|
|
|
|
|
|
|
|
|
|
60 |
|
61 |
# Gradio app UI
|
62 |
with gr.Blocks() as demo:
|
63 |
gr.Markdown("""
|
64 |
# Local Parquet SQL Query App
|
65 |
|
66 |
-
Query and explore data in `sample_contract_df.parquet` using DuckDB and SQL queries.
|
67 |
|
68 |
## Instructions
|
69 |
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
74 |
""")
|
75 |
|
76 |
with gr.Tabs():
|
|
|
77 |
with gr.TabItem("Query Data"):
|
78 |
with gr.Row():
|
79 |
with gr.Column(scale=1):
|
80 |
query = gr.Textbox(
|
81 |
label="Natural Language Query",
|
82 |
-
placeholder=
|
|
|
83 |
)
|
84 |
btn_generate = gr.Button("Generate SQL")
|
85 |
sql_out = gr.Code(label="Generated SQL Query", language="sql")
|
86 |
btn_execute = gr.Button("Execute Query")
|
87 |
error_out = gr.Markdown("", visible=False)
|
88 |
with gr.Column(scale=2):
|
89 |
-
results_out = gr.Dataframe(label="Query Results")
|
90 |
|
|
|
91 |
with gr.TabItem("Dataset Schema"):
|
92 |
gr.Markdown("### Dataset Schema")
|
93 |
-
|
94 |
|
95 |
# Set up click events
|
96 |
btn_generate.click(
|
97 |
fn=generate_sql_query,
|
98 |
-
inputs=
|
99 |
outputs=sql_out,
|
100 |
)
|
101 |
btn_execute.click(
|
|
|
1 |
import json
|
2 |
import gradio as gr
|
3 |
import duckdb
|
4 |
+
import re
|
5 |
+
from functools import lru_cache
|
6 |
|
7 |
# Path to the Parquet dataset queried by the app.
dataset_path = 'sample_contract_df.parquet'  # Update with your Parquet file path

# (column name, DuckDB type) pairs for sample_contract_df.parquet.
_SCHEMA_COLUMNS = [
    ("department_ind_agency", "VARCHAR"),
    ("cgac", "BIGINT"),
    ("sub_tier", "VARCHAR"),
    ("fpds_code", "VARCHAR"),
    ("office", "VARCHAR"),
    ("aac_code", "VARCHAR"),
    ("posteddate", "VARCHAR"),
    ("type", "VARCHAR"),
    ("basetype", "VARCHAR"),
    ("popstreetaddress", "VARCHAR"),
    ("popcity", "VARCHAR"),
    ("popstate", "VARCHAR"),
    ("popzip", "VARCHAR"),
    ("popcountry", "VARCHAR"),
    ("active", "VARCHAR"),
    ("awardnumber", "VARCHAR"),
    ("awarddate", "VARCHAR"),
    ("award", "DOUBLE"),
    ("awardee", "VARCHAR"),
    ("state", "VARCHAR"),
    ("city", "VARCHAR"),
    ("zipcode", "VARCHAR"),
    ("countrycode", "VARCHAR"),
]

# Schema records in DESCRIBE-style dict form (same shape as the hand-written
# literal list this replaces: every column nullable, no key/default/extra).
schema = [
    {
        "column_name": name,
        "column_type": ctype,
        "null": "YES",
        "key": None,
        "default": None,
        "extra": None,
    }
    for name, ctype in _SCHEMA_COLUMNS
]

# Memoized accessor for the static schema records.
@lru_cache(maxsize=1)
def get_schema():
    """Return the static schema records for the contract dataset."""
    return schema

# column name -> DuckDB column type
COLUMN_TYPES = {entry["column_name"]: entry["column_type"] for entry in get_schema()}

# Names of the numeric columns (everything else is treated as text).
NUMERIC_COLUMNS = {
    entry["column_name"]
    for entry in get_schema()
    if entry["column_type"] in ("DOUBLE", "BIGINT", "INT", "FLOAT", "DECIMAL")
}
47 |
+
|
48 |
+
# Create (or recreate) the DuckDB view over the Parquet file.
@lru_cache(maxsize=1)
def load_dataset_schema():
    """Register the `contract_data` view in DuckDB.

    Returns True on success, False if the view could not be created
    (e.g. the Parquet file is missing). Memoized, so the view is only
    (re)created once per process.
    """
    con = duckdb.connect()
    try:
        # Recreate from scratch to avoid "view already exists" errors.
        con.execute("DROP VIEW IF EXISTS contract_data")
        con.execute(f"CREATE VIEW contract_data AS SELECT * FROM '{dataset_path}'")
    except Exception as e:
        print(f"Error loading dataset schema: {e}")
        return False
    else:
        return True
    finally:
        con.close()
|
63 |
+
# Natural-language -> SQL WHERE-clause translator (regex based).
def parse_query(nl_query):
    """
    Convert a natural language query into a SQL WHERE condition string.

    Supports numeric comparisons (greater/less than [or equal to], between),
    string comparisons (equals, not equal, contains, starts/ends with) and
    the logical connectives "and" / "or".

    Note: the whole query is lowercased first, so matching is
    case-insensitive but string values are emitted in lowercase.
    """
    query = nl_query.lower()

    # Numeric comparisons first, so "greater than or equal to" is consumed
    # before the and/or connective pass touches its "or".
    numeric_patterns = [
        (r'(\w+)\s+(?:greater than or equal to|>=)\s+([\d\.]+)', r'\1 >= \2'),
        (r'(\w+)\s+(?:less than or equal to|<=)\s+([\d\.]+)', r'\1 <= \2'),
        (r'(\w+)\s+(?:greater than|>)\s+([\d\.]+)', r'\1 > \2'),
        (r'(\w+)\s+(?:less than|<)\s+([\d\.]+)', r'\1 < \2'),
        (r'between\s+([\d\.]+)\s+and\s+([\d\.]+)', r'BETWEEN \1 AND \2'),
    ]
    for pattern, replacement in numeric_patterns:
        query = re.sub(pattern, replacement, query)

    # A value is one or more words, but stops before "and"/"or" so that
    # `state equals ca and award > 1` does not swallow the rest of the
    # query into the value (the old greedy [\w\s]+ did exactly that).
    value = r'(\w+(?:\s+(?!and\b|or\b)\w+)*)'

    # String comparisons must use SINGLE quotes: in DuckDB (standard SQL)
    # double quotes denote identifiers, so `x = "approved"` is parsed as a
    # column reference and the query fails at execution time.
    # "not equal" is tried before "equal"; the old order let the equal
    # pattern match inside "not equal to X" and mangle it.
    string_patterns = [
        (r'(\w+)\s+(?:not equal to|!=|not equal)\s+\"?' + value + r'\"?', r"\1 != '\2'"),
        (r'(\w+)\s+(?:equal to|=|equals to|equals|equal)\s+\"?' + value + r'\"?', r"\1 = '\2'"),
        (r'(\w+)\s+contains\s+\"?' + value + r'\"?', r"\1 LIKE '%\2%'"),
        (r'(\w+)\s+starts with\s+\"?' + value + r'\"?', r"\1 LIKE '\2%'"),
        (r'(\w+)\s+ends with\s+\"?' + value + r'\"?', r"\1 LIKE '%\2'"),
    ]
    for pattern, replacement in string_patterns:
        query = re.sub(pattern, replacement, query)

    # Uppercase the logical connectives for SQL.
    query = query.replace(' and ', ' AND ').replace(' or ', ' OR ')

    # Collapse any stray multiple spaces.
    return re.sub(r'\s+', ' ', query).strip()
|
97 |
+
# Build a full SELECT statement from the natural language input.
def generate_sql_query(query):
    """
    Generate a SQL query over `contract_data` from natural language input.

    Falls back to an unfiltered SELECT when the parsed condition is empty
    (e.g. blank input), since "SELECT ... WHERE " with no condition is
    invalid SQL and would always fail at execution time.
    """
    condition = parse_query(query)
    if not condition:
        return "SELECT * FROM contract_data"
    return f"SELECT * FROM contract_data WHERE {condition}"
105 |
|
106 |
+
# Execute the SQL query and return results or error
def execute_query(sql_query):
    """
    Execute `sql_query` against the DuckDB view over the Parquet file.

    Returns (DataFrame, "") on success, or (None, error message) if the
    query fails for any reason.
    """
    try:
        con = duckdb.connect()
        try:
            # Ensure the view exists in this fresh connection.
            con.execute(f"CREATE OR REPLACE VIEW contract_data AS SELECT * FROM '{dataset_path}'")
            result_df = con.execute(sql_query).fetchdf()
        finally:
            # Close unconditionally: previously the connection leaked
            # whenever execute() raised before reaching con.close().
            con.close()
        return result_df, ""
    except Exception as e:
        # Surface the failure to the UI instead of crashing the app.
        return None, f"Error executing query: {e}"
121 |
|
122 |
+
# Memoized JSON rendering of the schema for the UI's schema tab.
@lru_cache(maxsize=1)
def get_schema_json():
    """Serialize the dataset schema to an indented JSON string."""
    schema_records = get_schema()
    return json.dumps(schema_records, indent=2)
126 |
+
|
127 |
+
# Initialize the dataset schema
# Fail fast at import time: the UI below is useless without the
# `contract_data` view, so abort startup if it cannot be created.
if not load_dataset_schema():
    raise Exception("Failed to load dataset schema. Please check the dataset path and format.")
130 |
|
131 |
# Gradio app UI
|
132 |
with gr.Blocks() as demo:
|
133 |
gr.Markdown("""
|
134 |
# Local Parquet SQL Query App
|
135 |
|
136 |
+
**Query and explore data** in `sample_contract_df.parquet` using DuckDB and natural language SQL queries.
|
137 |
|
138 |
## Instructions
|
139 |
|
140 |
+
1. **Enter a Natural Language Query**: Describe the data you want to retrieve. For example:
|
141 |
+
- `award greater than 1000 and status equal to "approved"`
|
142 |
+
- `department_ind_agency equals "Health and Human Services" or awardee contains "Tech"`
|
143 |
+
2. **Generate SQL**: Click "Generate SQL" to see the SQL query that will be executed.
|
144 |
+
3. **Execute Query**: Click "Execute Query" to run the query and view the results.
|
145 |
+
4. **View Dataset Schema**: Check the "Dataset Schema" tab to understand available columns and their types.
|
146 |
+
|
147 |
+
## Example Queries
|
148 |
+
|
149 |
+
- `award greater than 50000 and state equals "CA"`
|
150 |
+
- `awardee contains "Solutions" or award less than 10000`
|
151 |
+
- `department_ind_agency equals "Defense" and awarddate greater than "2023-01-01"`
|
152 |
""")
|
153 |
|
154 |
with gr.Tabs():
|
155 |
+
# Query Tab
|
156 |
with gr.TabItem("Query Data"):
|
157 |
with gr.Row():
|
158 |
with gr.Column(scale=1):
|
159 |
query = gr.Textbox(
|
160 |
label="Natural Language Query",
|
161 |
+
placeholder='e.g., "award greater than 1000 and state equals \\"CA\\""',
|
162 |
+
lines=4
|
163 |
)
|
164 |
btn_generate = gr.Button("Generate SQL")
|
165 |
sql_out = gr.Code(label="Generated SQL Query", language="sql")
|
166 |
btn_execute = gr.Button("Execute Query")
|
167 |
error_out = gr.Markdown("", visible=False)
|
168 |
with gr.Column(scale=2):
|
169 |
+
results_out = gr.Dataframe(label="Query Results", interactive=False)
|
170 |
|
171 |
+
# Schema Tab
|
172 |
with gr.TabItem("Dataset Schema"):
|
173 |
gr.Markdown("### Dataset Schema")
|
174 |
+
schema_display = gr.JSON(label="Schema", value=json.loads(get_schema_json()))
|
175 |
|
176 |
# Set up click events
|
177 |
btn_generate.click(
|
178 |
fn=generate_sql_query,
|
179 |
+
inputs=query,
|
180 |
outputs=sql_out,
|
181 |
)
|
182 |
btn_execute.click(
|