"""Gradio app that answers natural-language questions about a Parquet dataset.

A question is sent to the OpenAI chat API together with the dataset schema,
the returned SQL is executed with DuckDB against a ``contract_data`` view over
the Parquet file, and the result is shown as a table and, when the question
asks for one, as a Plotly chart.
"""

import json
import os
import re
from functools import lru_cache

import duckdb
import gradio as gr
import openai
import pandas as pd
import plotly.express as px

# =========================
# Configuration and Setup
# =========================

# Set OpenAI API key
openai.api_key = os.getenv("OPENAI_API_KEY")

# Load the Parquet dataset path
dataset_path = 'sample_contract_df.parquet'  # Update with your Parquet file path

# Provided schema
schema = [
    {"column_name": "department_ind_agency", "column_type": "VARCHAR"},
    {"column_name": "cgac", "column_type": "BIGINT"},
    {"column_name": "sub_tier", "column_type": "VARCHAR"},
    {"column_name": "fpds_code", "column_type": "VARCHAR"},
    {"column_name": "office", "column_type": "VARCHAR"},
    {"column_name": "aac_code", "column_type": "VARCHAR"},
    {"column_name": "posteddate", "column_type": "VARCHAR"},
    {"column_name": "type", "column_type": "VARCHAR"},
    {"column_name": "basetype", "column_type": "VARCHAR"},
    {"column_name": "popstreetaddress", "column_type": "VARCHAR"},
    {"column_name": "popcity", "column_type": "VARCHAR"},
    {"column_name": "popstate", "column_type": "VARCHAR"},
    {"column_name": "popzip", "column_type": "VARCHAR"},
    {"column_name": "popcountry", "column_type": "VARCHAR"},
    {"column_name": "active", "column_type": "VARCHAR"},
    {"column_name": "awardnumber", "column_type": "VARCHAR"},
    {"column_name": "awarddate", "column_type": "VARCHAR"},
    {"column_name": "award", "column_type": "DOUBLE"},
    {"column_name": "awardee", "column_type": "VARCHAR"},
    {"column_name": "state", "column_type": "VARCHAR"},
    {"column_name": "city", "column_type": "VARCHAR"},
    {"column_name": "zipcode", "column_type": "VARCHAR"},
    {"column_name": "countrycode", "column_type": "VARCHAR"},
]


@lru_cache(maxsize=1)
def get_schema():
    """Return the module-level ``schema`` list (column name/type dicts)."""
    return schema


# Column name -> declared column type, derived from the schema above.
COLUMN_TYPES = {col['column_name']: col['column_type'] for col in get_schema()}

# =========================
# Database Interaction
# =========================


def _create_contract_view(con):
    """(Re)create the ``contract_data`` view over the Parquet file on *con*."""
    # Double any single quote so the path stays a valid SQL string literal.
    quoted_path = dataset_path.replace("'", "''")
    con.execute("DROP VIEW IF EXISTS contract_data")
    con.execute(f"CREATE VIEW contract_data AS SELECT * FROM '{quoted_path}'")


def load_dataset_schema():
    """
    Creates the ``contract_data`` view on a temporary DuckDB connection.

    The connection is closed before returning, so the view itself is not
    kept; queries build their own view in ``execute_query``. In practice this
    call acts as a startup check (presumably DuckDB reads the Parquet
    metadata while creating the view -- confirm against the DuckDB docs).

    Returns:
        True when the view was created, False when any step failed (the
        error is printed).
    """
    con = None
    try:
        con = duckdb.connect()
        _create_contract_view(con)
        return True
    except Exception as e:
        print(f"Error loading dataset schema: {e}")
        return False
    finally:
        if con is not None:
            con.close()


# Load the dataset schema at startup
load_dataset_schema()


def execute_query(sql_query):
    """
    Executes the SQL query and returns the results.

    A fresh DuckDB connection is opened for every call, the ``contract_data``
    view is created on it, and the connection is always closed afterwards,
    including when the query fails.

    Args:
        sql_query: SQL text to run against the ``contract_data`` view.

    Returns:
        ``(result_df, "")`` on success, or ``(None, error_message)`` when
        connecting, creating the view, or running the query raised.
    """
    # NOTE(review): sql_query is model-generated from free-form user text and
    # runs with the full privileges of this DuckDB connection (DuckDB SQL can
    # read and write local files). Consider restricting it to read-only
    # statements before exposing the app to untrusted users.
    con = None
    try:
        con = duckdb.connect()
        con.execute("PRAGMA threads=4")  # Optimize for performance
        _create_contract_view(con)
        result_df = con.execute(sql_query).fetchdf()
        return result_df, ""
    except Exception as e:
        return None, f"Error executing query: {e}"
    finally:
        # Close on every path; previously a failing statement leaked the connection.
        if con is not None:
            con.close()


# =========================
# OpenAI API Integration
# =========================

# Matches the first Markdown code fence in a reply, with an optional sql/duckdb tag.
_CODE_FENCE_RE = re.compile(
    r"```(?:sql|duckdb)?\s*(.*?)```", re.DOTALL | re.IGNORECASE
)


def _strip_code_fences(text):
    """Return the contents of the first Markdown code fence in *text*, else *text* stripped."""
    fenced = _CODE_FENCE_RE.search(text)
    if fenced:
        return fenced.group(1).strip()
    return text.strip()


def parse_query(nl_query):
    """
    Converts a natural language query into a SQL query using OpenAI's API.

    Args:
        nl_query: The user's question in plain language.

    Returns:
        The SQL text from the model's reply (Markdown code fences removed),
        or a string starting with ``"Error generating SQL query:"`` when the
        API call raised. Callers detect failure via the ``"Error"`` prefix.
    """
    messages = [
        {"role": "system", "content": "You are an assistant that converts natural language queries into SQL queries for the 'contract_data' table."},
        {"role": "user", "content": f"Schema:\n{json.dumps(schema, indent=2)}\n\nQuery:\n\"{nl_query}\"\n\nSQL:"}
    ]

    try:
        # NOTE(review): openai.ChatCompletion is the pre-1.0 interface of the
        # openai package; confirm the pinned version still provides it.
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=messages,
            temperature=0,
            max_tokens=150,
        )
        reply = response.choices[0].message.content
        # Chat models often wrap SQL in ```sql fences, which DuckDB cannot parse.
        return _strip_code_fences(reply)
    except Exception as e:
        return f"Error generating SQL query: {e}"


# =========================
# Plotting Utilities
# =========================

# Words that signal the user wants a chart.
_PLOT_KEYWORDS = (
    'plot', 'graph', 'chart', 'distribution', 'visualize', 'trend',
    'histogram', 'bar', 'line', 'scatter', 'pie',
)

# Chart-type keywords in the order they take precedence when several appear.
_CHART_PRIORITY = ('bar', 'line', 'scatter', 'pie')

# A whole word built from one or more keywords plus an optional inflection:
# matches "plots", "plotting", "scatterplot", "barchart", but not words that
# merely contain a keyword ("recipient", "decline", "geographic", "embargo").
_PLOT_WORD_RE = re.compile(
    rf"\b(?:{'|'.join(_PLOT_KEYWORDS)})+(?:s|d|ed|ing|ted|ting)?\b"
)


def _plot_keywords_in(nl_query):
    """Return the set of plot keywords that occur as words in *nl_query*."""
    found = set()
    for word in _PLOT_WORD_RE.findall(nl_query.lower()):
        found.update(keyword for keyword in _PLOT_KEYWORDS if keyword in word)
    return found


def detect_plot_intent(nl_query):
    """
    Detects if the user's query involves plotting.

    Keywords are matched as words (allowing simple inflections and compounds
    such as "scatterplot"), not as arbitrary substrings.

    Returns:
        True when at least one plot keyword is present, otherwise False.
    """
    return bool(_plot_keywords_in(nl_query))


def generate_plot(nl_query, result_df):
    """
    Generates a Plotly figure based on the result DataFrame and the user's intent.

    The first column of ``result_df`` is used for the x axis (or pie names)
    and the second for the y axis (or pie values). The chart type is the
    first of bar, line, scatter, pie mentioned in the query; a bar chart is
    the default.

    Args:
        nl_query: The user's question; decides whether and what to plot.
        result_df: Query result to visualise.

    Returns:
        ``(figure, "")`` when a chart was built, ``(None, "")`` when the query
        does not ask for a plot, or ``(None, message)`` when the result has
        fewer than two columns.
    """
    keywords = _plot_keywords_in(nl_query)
    if not keywords:
        return None, ""

    columns = result_df.columns.tolist()
    if len(columns) < 2:
        return None, "Not enough data to generate a plot."

    label_col, value_col = columns[0], columns[1]

    # Simple heuristic to choose plot type based on keywords
    chart_type = next(
        (kind for kind in _CHART_PRIORITY if kind in keywords), 'bar'
    )
    if chart_type == 'line':
        fig = px.line(result_df, x=label_col, y=value_col, title='Line Chart')
    elif chart_type == 'scatter':
        fig = px.scatter(result_df, x=label_col, y=value_col, title='Scatter Plot')
    elif chart_type == 'pie':
        fig = px.pie(result_df, names=label_col, values=value_col, title='Pie Chart')
    else:
        # Explicit "bar" request, or the default when no chart type is named.
        fig = px.bar(result_df, x=label_col, y=value_col, title='Bar Chart')

    fig.update_layout(title_x=0.5)
    return fig, ""


# =========================
# Click Event Handlers
# =========================


def on_query_submit(nl_query):
    """
    Handles the submission of a natural language query.

    Args:
        nl_query: Text from the query box.

    Returns:
        A 3-tuple for ``(error_out, results_out, plot_out)``: an update that
        shows or hides the message area, the result DataFrame (or None), and
        the Plotly figure (or None). When only the plot could not be built,
        the result table is still returned together with the plot message.
    """
    if not nl_query or not nl_query.strip():
        return gr.update(visible=True, value="Please enter a query."), None, None

    sql_query = parse_query(nl_query)
    # parse_query reports failures as a string starting with "Error".
    if sql_query.startswith("Error"):
        return gr.update(visible=True, value=sql_query), None, None

    result_df, error_msg = execute_query(sql_query)
    if error_msg:
        return gr.update(visible=True, value=error_msg), None, None

    fig, plot_error = generate_plot(nl_query, result_df)
    if plot_error:
        # The query itself succeeded: keep the table and only report the plot issue.
        return gr.update(visible=True, value=plot_error), result_df, None

    return gr.update(visible=False, value=""), result_df, fig


def on_input_focus():
    """
    Shows the dataset schema when the input box is focused.
    """
    return gr.update(visible=True)


# =========================
# Gradio Application UI
# =========================

with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as demo:
    gr.Markdown(
        "\n\nParquet Data Explorer\n\nQuery and visualize your data effortlessly.\n\n",
        elem_id="main-title",
    )

    with gr.Row():
        with gr.Column(scale=1):
            query = gr.Textbox(
                label="Your Query",
                placeholder='e.g., "What are the total awards over 1M in California?"',
                lines=1
            )
            # Hidden schema display that appears on focus
            schema_display = gr.JSON(
                label="Dataset Schema",
                value=get_schema(),
                interactive=False,
                visible=False
            )
            error_out = gr.Markdown(
                value="",
                visible=False
            )

        with gr.Column(scale=2):
            results_out = gr.DataFrame(
                label="Results",
                interactive=False
            )
            plot_out = gr.Plot(
                label="Visualization"
            )

    gr.Markdown(" ")

    # =========================
    # Assign Event Handlers
    # =========================

    query.submit(
        fn=on_query_submit,
        inputs=query,
        outputs=[error_out, results_out, plot_out]
    )
    query.focus(
        fn=on_input_focus,
        inputs=None,
        outputs=schema_display
    )

# =========================
# Launch the Gradio App
# =========================

if __name__ == "__main__":
    demo.launch()