Spaces:

LeonceNsh
/

usgov-contracts-rag

Sleeping

App Files Files Community

LeonceNsh committed on Oct 31, 2024

Commit

7012184

verified ·

1 Parent(s): 1f06762

Update app.py

Browse files

Files changed (1) hide show

app.py +76 -43

app.py CHANGED Viewed

@@ -3,6 +3,7 @@ import gradio as gr
 import duckdb
 import re
 from functools import lru_cache
 # Load the Parquet dataset path
 dataset_path = 'sample_contract_df.parquet'  # Update with your Parquet file path
@@ -42,9 +43,6 @@ def get_schema():
 # Map column names to their types
 COLUMN_TYPES = {col['column_name']: col['column_type'] for col in get_schema()}
-# Define columns that are numeric
-NUMERIC_COLUMNS = {col['column_name'] for col in get_schema() if col['column_type'] in ['DOUBLE', 'BIGINT', 'INT', 'FLOAT', 'DECIMAL']}
 # Function to load the dataset schema into DuckDB
 @lru_cache(maxsize=1)
 def load_dataset_schema():
@@ -60,39 +58,74 @@ def load_dataset_schema():
     finally:
         con.close()
-# Advanced Natural Language to SQL Parser using schema
 def parse_query(nl_query):
     """
     Converts a natural language query into SQL WHERE conditions based on the schema.
     """
-    # Lowercase the query for uniformity
     query = nl_query.lower()
-    # Define patterns for comparison
-    patterns = [
-        (r'(\w+)\s+(greater than or equal to|>=)\s+([\d\.]+)', r"\1 >= \3"),
-        (r'(\w+)\s+(less than or equal to|<=)\s+([\d\.]+)', r"\1 <= \3"),
-        (r'(\w+)\s+(greater than|>)\s+([\d\.]+)', r"\1 > \3"),
-        (r'(\w+)\s+(less than|<)\s+([\d\.]+)', r"\1 < \3"),
-        (r'(\w+)\s+(equal to|=|equals to|equals|equal)\s+\"?([\w\s]+)\"?', r'\1 = "\3"'),
-        (r'(\w+)\s+(not equal to|!=|not equal)\s+\"?([\w\s]+)\"?', r'\1 != "\3"'),
-        (r'between\s+([\d\.]+)\s+and\s+([\d\.]+)', r'BETWEEN \1 AND \2'),
-        (r'(\w+)\s+contains\s+\"?([\w\s]+)\"?', r'\1 LIKE "%\2%"'),
-        (r'(\w+)\s+starts with\s+\"?([\w\s]+)\"?', r'\1 LIKE "\2%"'),
-        (r'(\w+)\s+ends with\s+\"?([\w\s]+)\"?', r'\1 LIKE "%\2"'),
-    ]
-    # Apply each pattern
-    for pattern, replacement in patterns:
-        query = re.sub(pattern, replacement, query)
-    # Handle logical operators
-    query = query.replace(' and ', ' AND ').replace(' or ', ' OR ')
-    # Remove any unintended multiple spaces
-    query = re.sub(r'\s+', ' ', query).strip()
-    return query
 # Generate SQL based on user query
 def generate_sql_query(query):
@@ -100,7 +133,10 @@ def generate_sql_query(query):
     Generates a SQL query based on the natural language input.
     """
     condition = parse_query(query)
-    sql_query = f"SELECT * FROM contract_data WHERE {condition}"
     return sql_query
 # Execute the SQL query and return results or error
@@ -138,23 +174,20 @@ with gr.Blocks() as demo:
     ## Instructions
     1. **Describe the data you want to retrieve**: For example:
-       - `award greater than 1000000`
-       - `department_ind_agency equals "Health and Human Services"`
-       - What is the total value of awards granted by each state in the year 2022?
-       - Which awardees received multiple awards, and what are the corresponding award amounts?
-       - How many awards were issued by each department division, such as the Department of Defense or Veterans Affairs?
-       - What is the distribution of awards by city and zip code across different countries?
-       - Which awards are currently active, and what are their respective award numbers and dates?
     2. **Generate SQL**: Click "Generate SQL" to see the SQL query that will be executed.
     3. **Execute Query**: Click "Execute Query" to run the query and view the results.
     4. **View Dataset Schema**: Check the "Dataset Schema" tab to understand available columns and their types.
     ## Example Queries
-    - `award > 720000000`
-    - `award greater than 10000000`
     """)
     with gr.Tabs():
@@ -164,7 +197,7 @@ with gr.Blocks() as demo:
                 with gr.Column(scale=1):
                     query = gr.Textbox(
                         label="Natural Language Query",
-                        placeholder='e.g., "award greater than 1000 and state equals \\"CA\\""',
                         lines=4
                     )
                     btn_generate = gr.Button("Generate SQL")

 import duckdb
 import re
 from functools import lru_cache
+from transformers import pipeline
 # Load the Parquet dataset path
 dataset_path = 'sample_contract_df.parquet'  # Update with your Parquet file path
 # Map column names to their types
 COLUMN_TYPES = {col['column_name']: col['column_type'] for col in get_schema()}
 # Function to load the dataset schema into DuckDB
 @lru_cache(maxsize=1)
 def load_dataset_schema():
     finally:
         con.close()
# Lazily construct the Hugging Face pipeline meant for query-intent parsing
@lru_cache(maxsize=1)
def get_nlp_model():
    """Return the zero-shot-classification pipeline, building it on first use.

    The pipeline is created with the ``facebook/bart-large-mnli`` model.
    ``lru_cache(maxsize=1)`` memoizes the result, so later calls return the
    same pipeline object instead of constructing a new one.
    """
    return pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
# Heuristic natural-language to SQL WHERE-clause parser (regex based, no ML model)

# Comparison phrases and the SQL operator each one maps to.
_COMPARISON_OPERATORS = {
    'greater than or equal to': '>=',
    'less than or equal to': '<=',
    'greater than': '>',
    'less than': '<',
    'not equal to': '!=',
    'equal to': '=',
}

# Text-matching phrases and the LIKE pattern each one produces.
_LIKE_TEMPLATES = {
    'contains': '%{}%',
    'starts with': '{}%',
    'ends with': '%{}',
}

# Regex alternation of every supported phrase, longest first, so that
# "greater than or equal to" is never read as "greater than" plus junk.
# Any run of whitespace is accepted between the words of a phrase.
_OPERATION_ALTERNATION = '|'.join(
    r'\s+'.join(re.escape(word) for word in phrase.split())
    for phrase in sorted(
        [*_COMPARISON_OPERATORS, *_LIKE_TEMPLATES, 'between'],
        key=len,
        reverse=True,
    )
)

# A plain decimal number, optionally signed and with thousands separators.
_NUMERIC_LITERAL = re.compile(r'-?(?:\d[\d,]*(?:\.\d+)?|\.\d+)')


def _clean_value(raw):
    """Trim surrounding whitespace and quote characters from a captured value."""
    return raw.strip(' \t\r\n"\'')


def _quote_sql_string(text):
    """Render *text* as a SQL string literal, doubling embedded single quotes."""
    return "'" + text.replace("'", "''") + "'"


def _sql_operand(text):
    """Render *text* as a bare number when it is one, else as a quoted string."""
    if _NUMERIC_LITERAL.fullmatch(text):
        # "1,000,000" is not valid SQL; drop the thousands separators.
        return text.replace(',', '')
    return _quote_sql_string(text)


def parse_query(nl_query):
    """
    Converts a natural language query into SQL WHERE conditions based on the schema.

    The query is scanned for ``<column> <operation> <value>`` phrases, where
    ``<column>`` is a column name returned by ``get_schema()`` and
    ``<operation>`` is one of: "greater than or equal to", "less than or
    equal to", "greater than", "less than", "equal to", "not equal to",
    "between", "contains", "starts with", "ends with".

    Column and operation matching is case-insensitive; values keep the case
    the user typed. At most one condition is produced per column, conditions
    are emitted in schema order, and they are always combined with ``AND``
    ("or" between conditions is not interpreted).

    Args:
        nl_query: The user's free-text query.

    Returns:
        The WHERE-clause body (without the ``WHERE`` keyword), or an empty
        string when no condition was recognised.
    """
    if not nl_query:
        return ''

    columns = [col['column_name'] for col in get_schema()]
    if not columns:
        return ''

    # A value runs until the next "and <column>" or the end of the query, so
    # one condition cannot swallow the next. Longest names go first so a
    # column that is a prefix of another does not win the alternation.
    column_alternation = '|'.join(
        re.escape(name) for name in sorted(columns, key=len, reverse=True)
    )
    value_end = rf'(?=\s+and\s+(?:{column_alternation})(?!\w)|\s*\Z)'

    conditions = []
    for col in columns:
        # (?<!\w) keeps "award" from matching inside "total_award".
        match = re.search(
            rf'(?<!\w){re.escape(col)}\s+(?P<op>{_OPERATION_ALTERNATION})\s+'
            rf'(?P<value>.+?){value_end}',
            nl_query,
            flags=re.IGNORECASE | re.DOTALL,
        )
        if match is None:
            continue

        # Normalise case and inner whitespace so the phrase can be looked up.
        op = ' '.join(match.group('op').lower().split())
        value = _clean_value(match.group('value'))

        if op in ('equal to', 'not equal to'):
            # Equality always compares against a quoted literal.
            operand = _quote_sql_string(value)
            conditions.append(f"{col} {_COMPARISON_OPERATORS[op]} {operand}")
        elif op in _COMPARISON_OPERATORS:
            # Ordering comparisons: bare number if numeric, otherwise a safely
            # quoted literal, so user text is never spliced in as raw SQL.
            operand = _sql_operand(value)
            conditions.append(f"{col} {_COMPARISON_OPERATORS[op]} {operand}")
        elif op == 'between':
            bounds = re.split(r'\s+and\s+', value, flags=re.IGNORECASE)
            if len(bounds) != 2:
                # Malformed range ("between 5"): skip rather than emit bad SQL.
                continue
            low, high = (_sql_operand(_clean_value(bound)) for bound in bounds)
            conditions.append(f"{col} BETWEEN {low} AND {high}")
        else:
            like_pattern = _LIKE_TEMPLATES[op].format(value)
            conditions.append(f"{col} LIKE {_quote_sql_string(like_pattern)}")

    # Combine conditions with AND
    return ' AND '.join(conditions)
 # Generate SQL based on user query
 def generate_sql_query(query):
     Generates a SQL query based on the natural language input.
     """
     condition = parse_query(query)
+    if condition:
+        sql_query = f"SELECT * FROM contract_data WHERE {condition}"
+    else:
+        sql_query = "SELECT * FROM contract_data"
     return sql_query
 # Execute the SQL query and return results or error
     ## Instructions
     1. **Describe the data you want to retrieve**: For example:
+       - `Show all awards greater than 1,000,000 in California`
+       - `List awardees who received multiple awards along with award amounts`
+       - `Number of awards issued by each department division`
+       - `Distribution of awards by city and zip code across different countries`
+       - `Active awards with their award numbers and dates`
     2. **Generate SQL**: Click "Generate SQL" to see the SQL query that will be executed.
     3. **Execute Query**: Click "Execute Query" to run the query and view the results.
     4. **View Dataset Schema**: Check the "Dataset Schema" tab to understand available columns and their types.
     ## Example Queries
+    - `award greater than 1000000 and state equal to "CA"`
+    - `List awards where department_ind_agency contains "Defense"`
     """)
     with gr.Tabs():
                 with gr.Column(scale=1):
                     query = gr.Textbox(
                         label="Natural Language Query",
+                        placeholder='e.g., "Show all awards greater than 1,000,000 in California"',
                         lines=4
                     )
                     btn_generate = gr.Button("Generate SQL")