Spaces:

LeonceNsh
/

usgov-contracts-rag

Sleeping

App Files Files Community

LeonceNsh committed on Nov 1, 2024

Commit

88c83f6

verified ·

1 Parent(s): 00c05fa

Update app.py

Browse files

Files changed (1) hide show

app.py +24 -115

app.py CHANGED Viewed

@@ -4,11 +4,11 @@ import duckdb
 from functools import lru_cache
 import pandas as pd
 import plotly.express as px
-import openai
 import os
 # Set OpenAI API key
-openai.api_key = os.getenv("OPENAI_API_KEY")
 # =========================
 # Configuration and Setup
@@ -21,35 +21,13 @@ dataset_path = 'sample_contract_df.parquet'  # Update with your Parquet file pat
 schema = [
     {"column_name": "department_ind_agency", "column_type": "VARCHAR"},
     {"column_name": "cgac", "column_type": "BIGINT"},
-    {"column_name": "sub_tier", "column_type": "VARCHAR"},
-    {"column_name": "fpds_code", "column_type": "VARCHAR"},
-    {"column_name": "office", "column_type": "VARCHAR"},
-    {"column_name": "aac_code", "column_type": "VARCHAR"},
-    {"column_name": "posteddate", "column_type": "VARCHAR"},
-    {"column_name": "type", "column_type": "VARCHAR"},
-    {"column_name": "basetype", "column_type": "VARCHAR"},
-    {"column_name": "popstreetaddress", "column_type": "VARCHAR"},
-    {"column_name": "popcity", "column_type": "VARCHAR"},
-    {"column_name": "popstate", "column_type": "VARCHAR"},
-    {"column_name": "popzip", "column_type": "VARCHAR"},
-    {"column_name": "popcountry", "column_type": "VARCHAR"},
-    {"column_name": "active", "column_type": "VARCHAR"},
-    {"column_name": "awardnumber", "column_type": "VARCHAR"},
-    {"column_name": "awarddate", "column_type": "VARCHAR"},
-    {"column_name": "award", "column_type": "DOUBLE"},
-    {"column_name": "awardee", "column_type": "VARCHAR"},
-    {"column_name": "state", "column_type": "VARCHAR"},
-    {"column_name": "city", "column_type": "VARCHAR"},
-    {"column_name": "zipcode", "column_type": "VARCHAR"},
-    {"column_name": "countrycode", "column_type": "VARCHAR"}
 ]
-# Cache the schema loading
 @lru_cache(maxsize=1)
 def get_schema():
     return schema
-# Map column names to their types
 COLUMN_TYPES = {col['column_name']: col['column_type'] for col in get_schema()}
 # =========================
@@ -62,7 +40,6 @@ def load_dataset_schema():
     """
     con = duckdb.connect()
     try:
-        # Drop the view if it exists to avoid errors
         con.execute("DROP VIEW IF EXISTS contract_data")
         con.execute(f"CREATE VIEW contract_data AS SELECT * FROM '{dataset_path}'")
         return True
@@ -78,28 +55,21 @@ def load_dataset_schema():
 async def parse_query(nl_query):
     """
-    Converts a natural language query into a SQL query using OpenAI's GPT-3.5-turbo model.
     """
     messages = [
-        {"role": "system",
-         "content": (
-             "You are an assistant that converts natural language queries into SQL queries "
-             "for a DuckDB database named 'contract_data'. Use the provided schema to form accurate SQL queries.")
-        },
-        {"role": "user",
-         "content": f"Schema:\n{json.dumps(schema, indent=2)}\n\nNatural Language Query:\n\"{nl_query}\"\n\nSQL Query:"
-        }
     ]
     try:
-        response = await openai.ChatCompletion.acreate(
             model="gpt-3.5-turbo",
             messages=messages,
-            temperature=0,  # Set to 0 for deterministic output
             max_tokens=150,
         )
-        sql_query = response.choices[0].message['content'].strip()
         return sql_query
     except Exception as e:
         return f"Error generating SQL query: {e}"
@@ -110,27 +80,19 @@ async def parse_query(nl_query):
 def detect_plot_intent(nl_query):
     """
-    Detects if the user's query involves plotting based on the presence of specific keywords.
     """
-    plot_keywords = [
-        'plot', 'graph', 'chart', 'distribution', 'visualize', 'histogram',
-        'bar chart', 'line chart', 'scatter plot', 'pie chart'
-    ]
-    for keyword in plot_keywords:
-        if keyword in nl_query.lower():
-            return True
-        return False
 async def generate_sql_and_plot_code(query):
     """
-    Generates SQL query and plotting code based on the natural language input.
     """
     is_plot = detect_plot_intent(query)
     sql_query = await parse_query(query)
     plot_code = ""
     if is_plot and not sql_query.startswith("Error"):
-        # Generate plot code based on the query
-        # For simplicity, we'll generate a basic plot code
         plot_code = """
 import plotly.express as px
 fig = px.bar(result_df, x='x_column', y='y_column', title='Generated Plot')
@@ -140,20 +102,18 @@ fig.update_layout(title_x=0.5)
 def execute_query(sql_query):
     """
-    Executes the SQL query and returns results or an error message.
     """
     if sql_query.startswith("Error"):
-        return None, sql_query  # Pass the error message forward
     try:
         con = duckdb.connect()
-        # Ensure the view is created
         con.execute(f"CREATE OR REPLACE VIEW contract_data AS SELECT * FROM '{dataset_path}'")
         result_df = con.execute(sql_query).fetchdf()
         con.close()
         return result_df, ""
     except Exception as e:
-        # In case of error, return None and error message
         return None, f"Error executing query: {e}"
 def generate_plot(plot_code, result_df):
@@ -163,41 +123,18 @@ def generate_plot(plot_code, result_df):
     if not plot_code.strip():
         return None, "No plot code provided."
     try:
-        # Replace placeholders in plot_code with actual column names
-        if result_df.empty:
-            return None, "Result DataFrame is empty."
         columns = result_df.columns.tolist()
         if len(columns) < 2:
             return None, "Not enough columns to plot."
         plot_code = plot_code.replace('x_column', columns[0])
         plot_code = plot_code.replace('y_column', columns[1])
-        # Execute the plot code
         local_vars = {'result_df': result_df, 'px': px}
         exec(plot_code, {}, local_vars)
         fig = local_vars.get('fig', None)
-        if fig:
-            return fig, ""
-        else:
-            return None, "Plot could not be generated."
     except Exception as e:
         return None, f"Error generating plot: {e}"
-# =========================
-# Schema Display
-# =========================
-@lru_cache(maxsize=1)
-def get_schema_json():
-    return json.dumps(get_schema(), indent=2)
-# =========================
-# Initialize Dataset Schema
-# =========================
-if not load_dataset_schema():
-    raise Exception("Failed to load dataset schema. Please check the dataset path and format.")
 # =========================
 # Gradio Application UI
 # =========================
@@ -210,36 +147,17 @@ with gr.Blocks() as demo:
     ## Instructions
-    1. **Describe the data you want to retrieve or plot**: For example:
-       - `Show all awards greater than 1,000,000 in California`
-       - `Plot the distribution of awards by state`
-       - `Show a bar chart of total awards per department`
-       - `List awardees who received multiple awards along with award amounts`
-       - `Number of awards issued by each department division`
-    2. **Generate SQL**: Click "Generate SQL" to see the SQL query that will be executed.
-    3. **Execute Query**: Click "Execute Query" to run the query and view the results.
-    4. **View Plot**: If your query involves plotting, the plot will be displayed.
-    5. **View Dataset Schema**: Check the "Dataset Schema" tab to understand available columns and their types.
-    ## Example Queries
-    - `Plot the total award amount by state`
-    - `Show a histogram of awards over time`
-    - `award greater than 1000000 and state equal to "CA"`
-    - `List awards where department_ind_agency contains "Defense"`
     """)
     with gr.Tabs():
-        # Query Tab
         with gr.TabItem("Query Data"):
             with gr.Row():
                 with gr.Column(scale=1):
-                    query = gr.Textbox(
-                        label="Natural Language Query",
-                        placeholder='e.g., "Show all awards greater than 1,000,000 in California"',
-                        lines=4
-                    )
                     btn_generate = gr.Button("Generate SQL")
                     sql_out = gr.Code(label="Generated SQL Query", language="sql")
                     plot_code_out = gr.Code(label="Generated Plot Code", language="python")
@@ -249,10 +167,9 @@ with gr.Blocks() as demo:
                     results_out = gr.Dataframe(label="Query Results", interactive=False)
                     plot_out = gr.Plot(label="Plot")
-        # Schema Tab
         with gr.TabItem("Dataset Schema"):
             gr.Markdown("### Dataset Schema")
-            schema_display = gr.JSON(label="Schema", value=json.loads(get_schema_json()))
     # =========================
     # Click Event Handlers
@@ -281,16 +198,8 @@ with gr.Blocks() as demo:
         else:
             return result_df, None, ""
-    btn_generate.click(
-        fn=on_generate_click,
-        inputs=query,
-        outputs=[sql_out, plot_code_out],
-    )
-    btn_execute.click(
-        fn=on_execute_click,
-        inputs=[sql_out, plot_code_out],
-        outputs=[results_out, plot_out, error_out],
-    )
 # =========================
 # Launch the Gradio App

 from functools import lru_cache
 import pandas as pd
 import plotly.express as px
 import os
+from openai import OpenAI
 # Set OpenAI API key
+client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
 # =========================
 # Configuration and Setup
 schema = [
     {"column_name": "department_ind_agency", "column_type": "VARCHAR"},
     {"column_name": "cgac", "column_type": "BIGINT"},
+    # Additional columns go here...
 ]
 @lru_cache(maxsize=1)
 def get_schema():
     return schema
 COLUMN_TYPES = {col['column_name']: col['column_type'] for col in get_schema()}
 # =========================
     """
     con = duckdb.connect()
     try:
         con.execute("DROP VIEW IF EXISTS contract_data")
         con.execute(f"CREATE VIEW contract_data AS SELECT * FROM '{dataset_path}'")
         return True
 async def parse_query(nl_query):
     """
+    Converts a natural language query into a SQL query using OpenAI's API.
     """
     messages = [
+        {"role": "system", "content": "Convert natural language queries to SQL queries for 'contract_data'."},
+        {"role": "user", "content": f"Schema:\n{json.dumps(schema, indent=2)}\n\nQuery:\n\"{nl_query}\"\n\nSQL:"}
     ]
     try:
+        response = await client.chat.completions.create(
             model="gpt-3.5-turbo",
             messages=messages,
+            temperature=0,
             max_tokens=150,
         )
+        sql_query = response.choices[0].message.content.strip()
         return sql_query
     except Exception as e:
         return f"Error generating SQL query: {e}"
 def detect_plot_intent(nl_query):
     """
+    Detects if the user's query involves plotting.
     """
+    plot_keywords = ['plot', 'graph', 'chart', 'distribution', 'visualize']
+    return any(keyword in nl_query.lower() for keyword in plot_keywords)
 async def generate_sql_and_plot_code(query):
     """
+    Generates SQL query and optional plotting code.
     """
     is_plot = detect_plot_intent(query)
     sql_query = await parse_query(query)
     plot_code = ""
     if is_plot and not sql_query.startswith("Error"):
         plot_code = """
 import plotly.express as px
 fig = px.bar(result_df, x='x_column', y='y_column', title='Generated Plot')
 def execute_query(sql_query):
     """
+    Executes the SQL query and returns the results.
     """
     if sql_query.startswith("Error"):
+        return None, sql_query
     try:
         con = duckdb.connect()
         con.execute(f"CREATE OR REPLACE VIEW contract_data AS SELECT * FROM '{dataset_path}'")
         result_df = con.execute(sql_query).fetchdf()
         con.close()
         return result_df, ""
     except Exception as e:
         return None, f"Error executing query: {e}"
 def generate_plot(plot_code, result_df):
     if not plot_code.strip():
         return None, "No plot code provided."
     try:
         columns = result_df.columns.tolist()
         if len(columns) < 2:
             return None, "Not enough columns to plot."
         plot_code = plot_code.replace('x_column', columns[0])
         plot_code = plot_code.replace('y_column', columns[1])
         local_vars = {'result_df': result_df, 'px': px}
         exec(plot_code, {}, local_vars)
         fig = local_vars.get('fig', None)
+        return fig, "" if fig else "Plot could not be generated."
     except Exception as e:
         return None, f"Error generating plot: {e}"
 # =========================
 # Gradio Application UI
 # =========================
     ## Instructions
+    1. **Describe the data you want**: e.g., `Show awards over 1M in CA`
+    2. **Generate SQL**: Click "Generate SQL" to see the SQL query.
+    3. **Execute Query**: Run the query to view results and plots.
+    4. **Dataset Schema**: See available columns and types in the "Schema" tab.
     """)
     with gr.Tabs():
         with gr.TabItem("Query Data"):
             with gr.Row():
                 with gr.Column(scale=1):
+                    query = gr.Textbox(label="Natural Language Query", placeholder='e.g., "Awards > 1M in CA"')
                     btn_generate = gr.Button("Generate SQL")
                     sql_out = gr.Code(label="Generated SQL Query", language="sql")
                     plot_code_out = gr.Code(label="Generated Plot Code", language="python")
                     results_out = gr.Dataframe(label="Query Results", interactive=False)
                     plot_out = gr.Plot(label="Plot")
         with gr.TabItem("Dataset Schema"):
             gr.Markdown("### Dataset Schema")
+            schema_display = gr.JSON(label="Schema", value=json.loads(json.dumps(get_schema(), indent=2)))
     # =========================
     # Click Event Handlers
         else:
             return result_df, None, ""
+    btn_generate.click(fn=on_generate_click, inputs=query, outputs=[sql_out, plot_code_out])
+    btn_execute.click(fn=on_execute_click, inputs=[sql_out, plot_code_out], outputs=[results_out, plot_out, error_out])
 # =========================
 # Launch the Gradio App