Spaces:

LeonceNsh
/

usgov-contracts-rag

Sleeping

App Files Files Community

LeonceNsh committed on Nov 1, 2024

Commit

dfe1769

verified ·

1 Parent(s): 776a658

Update app.py

Browse files

Files changed (1) hide show

app.py +116 -88

app.py CHANGED Viewed

@@ -1,9 +1,11 @@
 import json
 import gradio as gr
 import duckdb
-import re
 from functools import lru_cache
 from transformers import pipeline
 # Load the Parquet dataset path
 dataset_path = 'sample_contract_df.parquet'  # Update with your Parquet file path
@@ -58,86 +60,62 @@ def load_dataset_schema():
     finally:
         con.close()
-# Initialize the NLP model for query parsing
-@lru_cache(maxsize=1)
-def get_nlp_model():
-    # We use a zero-shot-classification pipeline for query intent understanding
-    classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
-    return classifier
-# Advanced Natural Language to SQL Parser using NLP
 def parse_query(nl_query):
     """
-    Converts a natural language query into SQL WHERE conditions based on the schema.
     """
-    # Tokenize and normalize the query
-    query = nl_query.lower()
-    # Identify columns and possible operations
-    columns = [col['column_name'] for col in get_schema()]
-    operations = ['greater than or equal to', 'less than or equal to', 'greater than', 'less than', 'equal to', 'not equal to', 'between', 'contains', 'starts with', 'ends with']
-    # Extract conditions from the query
-    conditions = []
-    # Simple heuristic parsing (can be replaced with more advanced NLP techniques)
-    for col in columns:
-        if col in query:
-            for op in operations:
-                if op in query:
-                    pattern = rf"{col}\s+{op}\s+(.*)"
-                    match = re.search(pattern, query)
-                    if match:
-                        value = match.group(1).strip(' "')
-                        sql_condition = ""
-                        # Map operations to SQL syntax
-                        if op == 'greater than or equal to':
-                            sql_condition = f"{col} >= {value}"
-                        elif op == 'less than or equal to':
-                            sql_condition = f"{col} <= {value}"
-                        elif op == 'greater than':
-                            sql_condition = f"{col} > {value}"
-                        elif op == 'less than':
-                            sql_condition = f"{col} < {value}"
-                        elif op == 'equal to':
-                            sql_condition = f"{col} = '{value}'"
-                        elif op == 'not equal to':
-                            sql_condition = f"{col} != '{value}'"
-                        elif op == 'between':
-                            values = value.split(' and ')
-                            if len(values) == 2:
-                                sql_condition = f"{col} BETWEEN {values[0]} AND {values[1]}"
-                        elif op == 'contains':
-                            sql_condition = f"{col} LIKE '%{value}%'"
-                        elif op == 'starts with':
-                            sql_condition = f"{col} LIKE '{value}%'"
-                        elif op == 'ends with':
-                            sql_condition = f"{col} LIKE '%{value}'"
-                        if sql_condition:
-                            conditions.append(sql_condition)
-                        break
-    # Combine conditions with AND
-    if conditions:
-        where_clause = ' AND '.join(conditions)
-    else:
-        where_clause = ''
-    return where_clause
-# Generate SQL based on user query
-def generate_sql_query(query):
     """
-    Generates a SQL query based on the natural language input.
     """
-    condition = parse_query(query)
-    if condition:
-        sql_query = f"SELECT * FROM contract_data WHERE {condition}"
-    else:
-        sql_query = "SELECT * FROM contract_data"
-    return sql_query
 # Execute the SQL query and return results or error
 def execute_query(sql_query):
@@ -152,9 +130,37 @@ def execute_query(sql_query):
         con.close()
         return result_df, ""
     except Exception as e:
-        # In case of error, return empty dataframe and error message
         return None, f"Error executing query: {e}"
 # Cache the schema JSON for display
 @lru_cache(maxsize=1)
 def get_schema_json():
@@ -167,25 +173,28 @@ if not load_dataset_schema():
 # Gradio app UI
 with gr.Blocks() as demo:
     gr.Markdown("""
-    # Parquet SQL Query App
-    **Query data** in `sample_contract_df.parquet`
     ## Instructions
-    1. **Describe the data you want to retrieve**: For example:
        - `Show all awards greater than 1,000,000 in California`
        - `List awardees who received multiple awards along with award amounts`
        - `Number of awards issued by each department division`
-       - `Distribution of awards by city and zip code across different countries`
-       - `Active awards with their award numbers and dates`
     2. **Generate SQL**: Click "Generate SQL" to see the SQL query that will be executed.
     3. **Execute Query**: Click "Execute Query" to run the query and view the results.
-    4. **View Dataset Schema**: Check the "Dataset Schema" tab to understand available columns and their types.
     ## Example Queries
     - `award greater than 1000000 and state equal to "CA"`
     - `List awards where department_ind_agency contains "Defense"`
     """)
@@ -202,10 +211,12 @@ with gr.Blocks() as demo:
                     )
                     btn_generate = gr.Button("Generate SQL")
                     sql_out = gr.Code(label="Generated SQL Query", language="sql")
                     btn_execute = gr.Button("Execute Query")
                     error_out = gr.Markdown("", visible=False)
                 with gr.Column(scale=2):
                     results_out = gr.Dataframe(label="Query Results", interactive=False)
         # Schema Tab
         with gr.TabItem("Dataset Schema"):
@@ -213,15 +224,32 @@ with gr.Blocks() as demo:
             schema_display = gr.JSON(label="Schema", value=json.loads(get_schema_json()))
     # Set up click events
     btn_generate.click(
-        fn=generate_sql_query,
         inputs=query,
-        outputs=sql_out,
     )
     btn_execute.click(
-        fn=execute_query,
-        inputs=sql_out,
-        outputs=[results_out, error_out],
     )
 # Launch the app

 import json
 import gradio as gr
 import duckdb
 from functools import lru_cache
 from transformers import pipeline
+import pandas as pd
+import plotly.express as px
+import openai
 # Load the Parquet dataset path
 dataset_path = 'sample_contract_df.parquet'  # Update with your Parquet file path
     finally:
         con.close()
+# Advanced Natural Language to SQL Parser using OpenAI's GPT-3
 def parse_query(nl_query):
     """
+    Converts a natural language query into SQL query using OpenAI GPT-3.
+    """
+    openai.api_key = 'YOUR_OPENAI_API_KEY'  # Replace with your OpenAI API key
+    prompt = f"""
+    Convert the following natural language query into a SQL query for a DuckDB database. Use 'contract_data' as the table name.
+    Schema:
+    {json.dumps(schema, indent=2)}
+    Query:
+    "{nl_query}"
+    """
+    try:
+        response = openai.Completion.create(
+            engine="text-davinci-003",
+            prompt=prompt,
+            temperature=0,
+            max_tokens=150,
+            top_p=1,
+            n=1,
+            stop=None
+        )
+        sql_query = response.choices[0].text.strip()
+        return sql_query
+    except Exception as e:
+        return f"Error generating SQL query: {e}"
+# Function to detect if the user wants a plot
def detect_plot_intent(nl_query):
    """
    Detects if the user's query involves plotting.

    Args:
        nl_query: The user's natural language request (may be empty or None).

    Returns:
        True when any plotting keyword occurs in the query (case-insensitive
        substring match), otherwise False. Empty or missing input is False.
    """
    if not nl_query:
        return False
    plot_keywords = ['plot', 'graph', 'chart', 'distribution', 'visualize', 'histogram', 'bar chart', 'line chart', 'scatter plot', 'pie chart']
    # Lower-case once instead of once per keyword.
    lowered = nl_query.lower()
    # Plain substring test: a keyword embedded in a longer word also matches.
    return any(keyword in lowered for keyword in plot_keywords)
+# Generate SQL and Plot Code based on user query
def generate_sql_and_plot_code(query):
    """
    Builds the SQL text and, when requested, a plotting snippet for a query.

    Returns a ``(sql_query, plot_code)`` pair. ``plot_code`` is the empty
    string unless the query is detected as a plotting request, in which case
    it is a generic bar-chart template containing the ``x_column`` and
    ``y_column`` placeholders.
    """
    # Generic template; the placeholders are filled in later from the result.
    bar_chart_template = (
        "\n"
        "import plotly.express as px\n"
        "fig = px.bar(result_df, x='x_column', y='y_column')\n"
    )
    wants_plot = detect_plot_intent(query)
    generated_sql = parse_query(query)
    return generated_sql, (bar_chart_template if wants_plot else "")
 # Execute the SQL query and return results or error
 def execute_query(sql_query):
         con.close()
         return result_df, ""
     except Exception as e:
+        # In case of error, return None and error message
         return None, f"Error executing query: {e}"
+# Generate and display plot
def _fill_plot_placeholders(plot_code, x_label, y_label):
    """Replace the ``x_column`` / ``y_column`` placeholders in one pass."""
    import re  # local import keeps this block self-contained

    labels = {'x_column': x_label, 'y_column': y_label}

    def _substitute(match):
        label = labels[match.group('name')]
        # A quoted placeholder becomes a proper Python literal, so labels
        # containing quotes or non-string labels stay valid code. A bare
        # placeholder is replaced by the label text, as before.
        return repr(label) if match.group('quote') else str(label)

    # Single pass: text inserted for one placeholder is never re-scanned.
    return re.sub(
        r"""(?P<quote>['"]?)(?P<name>[xy]_column)(?P=quote)""",
        _substitute,
        plot_code,
    )


def generate_plot(plot_code, result_df):
    """
    Executes the plot code to generate a plot from the result DataFrame.

    Args:
        plot_code: Python source expected to assign a figure to ``fig``. The
            placeholders ``x_column`` and ``y_column`` are replaced by the
            first and second column of ``result_df``.
        result_df: DataFrame holding the query result.

    Returns:
        ``(fig, "")`` on success, or ``(None, message)`` describing why no
        plot was produced. Exceptions are reported through the message.
    """
    if not plot_code or not plot_code.strip():
        return None, "No plot code provided."
    try:
        if result_df is None or result_df.empty:
            return None, "Result DataFrame is empty."
        columns = result_df.columns.tolist()
        if len(columns) < 2:
            return None, "Not enough columns to plot."
        filled_code = _fill_plot_placeholders(plot_code, columns[0], columns[1])
        # One shared namespace, so functions or comprehensions defined by the
        # plot code can still see `result_df` and `px`.
        namespace = {'px': px, 'result_df': result_df}
        # NOTE(security): this executes arbitrary Python. If `plot_code` can
        # be edited by an untrusted user, exec() must be replaced by a
        # restricted plot specification - flagged here, not silently changed.
        exec(filled_code, namespace)
        fig = namespace.get('fig')
        # Identity check: do not rely on the truthiness of a figure object.
        if fig is None:
            return None, "Plot could not be generated."
        return fig, ""
    except Exception as e:
        return None, f"Error generating plot: {e}"
 # Cache the schema JSON for display
 @lru_cache(maxsize=1)
 def get_schema_json():
 # Gradio app UI
 with gr.Blocks() as demo:
     gr.Markdown("""
+    # Parquet SQL Query and Plotting App
+    **Query and visualize data** in `sample_contract_df.parquet`
     ## Instructions
+    1. **Describe the data you want to retrieve or plot**: For example:
        - `Show all awards greater than 1,000,000 in California`
+       - `Plot the distribution of awards by state`
+       - `Show a bar chart of total awards per department`
        - `List awardees who received multiple awards along with award amounts`
        - `Number of awards issued by each department division`
     2. **Generate SQL**: Click "Generate SQL" to see the SQL query that will be executed.
     3. **Execute Query**: Click "Execute Query" to run the query and view the results.
+    4. **View Plot**: If your query involves plotting, the plot will be displayed.
+    5. **View Dataset Schema**: Check the "Dataset Schema" tab to understand available columns and their types.
     ## Example Queries
+    - `Plot the total award amount by state`
+    - `Show a histogram of awards over time`
     - `award greater than 1000000 and state equal to "CA"`
     - `List awards where department_ind_agency contains "Defense"`
     """)
                     )
                     btn_generate = gr.Button("Generate SQL")
                     sql_out = gr.Code(label="Generated SQL Query", language="sql")
+                    plot_code_out = gr.Code(label="Generated Plot Code", language="python")
                     btn_execute = gr.Button("Execute Query")
                     error_out = gr.Markdown("", visible=False)
                 with gr.Column(scale=2):
                     results_out = gr.Dataframe(label="Query Results", interactive=False)
+                    plot_out = gr.Plot(label="Plot")
         # Schema Tab
         with gr.TabItem("Dataset Schema"):
             schema_display = gr.JSON(label="Schema", value=json.loads(get_schema_json()))
     # Set up click events
+    def on_generate_click(nl_query):
+        sql_query, plot_code = generate_sql_and_plot_code(nl_query)
+        return sql_query, plot_code
+    def on_execute_click(sql_query, plot_code):
+        result_df, error_msg = execute_query(sql_query)
+        if error_msg:
+            return None, None, error_msg
+        if plot_code.strip():
+            fig, plot_error = generate_plot(plot_code, result_df)
+            if plot_error:
+                return result_df, None, plot_error
+            else:
+                return result_df, fig, ""
+        else:
+            return result_df, None, ""
     btn_generate.click(
+        fn=on_generate_click,
         inputs=query,
+        outputs=[sql_out, plot_code_out],
     )
     btn_execute.click(
+        fn=on_execute_click,
+        inputs=[sql_out, plot_code_out],
+        outputs=[results_out, plot_out, error_out],
     )
 # Launch the app