Spaces:

LeonceNsh
/

usgov-contracts-rag

Sleeping

App Files Files Community

usgov-contracts-rag / app.py

LeonceNsh

Update app.py

a1792a1 verified 8 months ago

raw

history blame

8.5 kB

	import json
	import gradio as gr
	import duckdb
	from functools import lru_cache
	import pandas as pd
	import plotly.express as px
	import openai
	import os

	# =========================
	# Configuration and Setup
	# =========================

	# Path of the Parquet file that the `contract_data` DuckDB view selects from.
	dataset_path = 'sample_contract_df.parquet' # Update with your Parquet file path

	# Column descriptors for the dataset: one {"column_name", "column_type"} dict
	# per column. This list is embedded in the prompt sent to the language model
	# and is displayed in the "Dataset Schema" tab of the UI.
	# NOTE(review): the list is hard-coded rather than read from the Parquet file;
	# confirm it matches the actual file whenever the dataset changes.
	schema = [
	{"column_name": "department_ind_agency", "column_type": "VARCHAR"},
	{"column_name": "cgac", "column_type": "BIGINT"},
	{"column_name": "sub_tier", "column_type": "VARCHAR"},
	{"column_name": "fpds_code", "column_type": "VARCHAR"},
	{"column_name": "office", "column_type": "VARCHAR"},
	{"column_name": "aac_code", "column_type": "VARCHAR"},
	{"column_name": "posteddate", "column_type": "VARCHAR"},
	{"column_name": "type", "column_type": "VARCHAR"},
	{"column_name": "basetype", "column_type": "VARCHAR"},
	{"column_name": "popstreetaddress", "column_type": "VARCHAR"},
	{"column_name": "popcity", "column_type": "VARCHAR"},
	{"column_name": "popstate", "column_type": "VARCHAR"},
	{"column_name": "popzip", "column_type": "VARCHAR"},
	{"column_name": "popcountry", "column_type": "VARCHAR"},
	{"column_name": "active", "column_type": "VARCHAR"},
	{"column_name": "awardnumber", "column_type": "VARCHAR"},
	{"column_name": "awarddate", "column_type": "VARCHAR"},
	{"column_name": "award", "column_type": "DOUBLE"},
	{"column_name": "awardee", "column_type": "VARCHAR"},
	{"column_name": "state", "column_type": "VARCHAR"},
	{"column_name": "city", "column_type": "VARCHAR"},
	{"column_name": "zipcode", "column_type": "VARCHAR"},
	{"column_name": "countrycode", "column_type": "VARCHAR"}
	]

	# Cache the schema loading
	@lru_cache(maxsize=1)
	def get_schema():
	    """Return the module-level ``schema`` list of column descriptors.

	    The same list object is handed back on every call.
	    """
	    return schema

	# Lookup table from column name to column type, built from the schema entries.
	COLUMN_TYPES = dict(
	    (entry['column_name'], entry['column_type']) for entry in get_schema()
	)

	# =========================
	# Database Interaction
	# =========================

	def load_dataset_schema():
	    """
	    Checks that the dataset can be exposed as the `contract_data` view.

	    Opens a fresh DuckDB connection, drops any existing `contract_data` view,
	    and recreates it over the file at `dataset_path`. The connection is closed
	    before returning, so callers only receive the success flag.

	    Returns:
	        True when both statements ran without raising, False otherwise
	        (the error is printed).
	    """
	    connection = duckdb.connect()
	    loaded = False
	    try:
	        statements = (
	            # Drop first so a pre-existing view cannot make the CREATE fail.
	            "DROP VIEW IF EXISTS contract_data",
	            f"CREATE VIEW contract_data AS SELECT * FROM '{dataset_path}'",
	        )
	        for statement in statements:
	            connection.execute(statement)
	        loaded = True
	    except Exception as exc:
	        print(f"Error loading dataset schema: {exc}")
	    finally:
	        connection.close()
	    return loaded

	# =========================
	# OpenAI API Integration
	# =========================

	def _strip_code_fence(text):
	    """Return *text* without a surrounding Markdown code fence, if present."""
	    cleaned = text.strip()
	    if len(cleaned) >= 6 and cleaned.startswith("```") and cleaned.endswith("```"):
	        inner = cleaned[3:-3]
	        first_line, newline, remainder = inner.partition("\n")
	        # Drop the opening fence line when it is empty or only a language tag.
	        if newline and first_line.strip().lower() in ("", "sql", "duckdb"):
	            inner = remainder
	        cleaned = inner.strip()
	    return cleaned


	async def parse_query(nl_query):
	    """
	    Converts a natural language query into a SQL query using OpenAI's
	    gpt-3.5-turbo chat model.

	    The dataset schema is embedded in the prompt. If the model wraps its
	    answer in a Markdown code fence, the fence is removed so the returned
	    text can be passed to DuckDB directly.

	    Args:
	        nl_query: The user's question in natural language.

	    Returns:
	        The generated SQL text, or a string starting with
	        "Error generating SQL query:" if the API call (or reading its
	        response) raised. Callers detect failure by that "Error" prefix.
	    """
	    messages = [
	        {"role": "system", "content": (
	            "You are an assistant that converts natural language queries into SQL queries "
	            "for a DuckDB database named 'contract_data'. Use the provided schema to form accurate SQL queries."
	        )},
	        {"role": "user", "content": (
	            f"Schema:\n{json.dumps(schema, indent=2)}\n\nNatural Language Query:\n\"{nl_query}\"\n\nSQL Query:"
	        )}
	    ]

	    try:
	        response = await openai.ChatCompletion.acreate(
	            model="gpt-3.5-turbo",
	            messages=messages,
	            temperature=0,  # deterministic output for the same question
	            max_tokens=150,
	        )
	        return _strip_code_fence(response.choices[0].message['content'])
	    except Exception as e:
	        # Errors are reported in-band so the UI can show them in place of SQL.
	        return f"Error generating SQL query: {e}"

	# =========================
	# Plotting Utilities
	# =========================

	def detect_plot_intent(nl_query):
	    """
	    Detects if the user's query involves plotting based on the presence of specific keywords.

	    Matching is a case-insensitive substring test, so for example "plots"
	    matches the keyword "plot".

	    Args:
	        nl_query: The user's natural language query. ``None`` or an empty
	            string is treated as having no plot intent.

	    Returns:
	        True if any plot keyword occurs in the query, otherwise False.
	    """
	    if not nl_query:
	        return False
	    plot_keywords = (
	        'plot', 'graph', 'chart', 'distribution', 'visualize', 'histogram',
	        'bar chart', 'line chart', 'scatter plot', 'pie chart',
	    )
	    # Lower-case once rather than once per keyword.
	    lowered = nl_query.lower()
	    return any(keyword in lowered for keyword in plot_keywords)

	async def generate_sql_and_plot_code(query):
	    """
	    Generates SQL query and plotting code based on the natural language input.

	    Args:
	        query: The user's natural language query.

	    Returns:
	        A ``(sql_query, plot_code)`` tuple. ``sql_query`` is whatever
	        ``parse_query`` returned (possibly an "Error ..." message).
	        ``plot_code`` is a bar-chart template containing the placeholders
	        ``x_column`` and ``y_column`` when the query asks for a plot and SQL
	        generation succeeded; otherwise it is an empty string.
	    """
	    is_plot = detect_plot_intent(query)
	    sql_query = await parse_query(query)
	    plot_code = ""
	    if is_plot and not sql_query.startswith("Error"):
	        # Built from explicit newline-terminated literals so the template always
	        # starts at column 0: this text is later passed to exec(), where stray
	        # leading whitespace would raise IndentationError.
	        plot_code = (
	            "\n"
	            "import plotly.express as px\n"
	            "fig = px.bar(result_df, x='x_column', y='y_column', title='Generated Plot')\n"
	            "fig.update_layout(title_x=0.5)\n"
	        )
	    return sql_query, plot_code

	def execute_query(sql_query):
	    """
	    Executes the SQL query and returns results or an error message.

	    A fresh DuckDB connection is opened for each call, the `contract_data`
	    view is (re)created over the file at `dataset_path`, and the query is run
	    against it. The connection is always closed, including when the query fails.

	    NOTE: the SQL text is executed as given; it is not validated or restricted
	    to read-only statements.

	    Args:
	        sql_query: SQL text to run, or an "Error ..." message produced by an
	            earlier step, which is passed through unchanged.

	    Returns:
	        ``(result_df, "")`` on success, or ``(None, error_message)`` on failure.
	    """
	    if not sql_query or not sql_query.strip():
	        return None, "Error executing query: no SQL query provided."
	    if sql_query.startswith("Error"):
	        return None, sql_query  # Pass the error message forward

	    try:
	        con = duckdb.connect()
	        try:
	            con.execute(f"CREATE OR REPLACE VIEW contract_data AS SELECT * FROM '{dataset_path}'")
	            result_df = con.execute(sql_query).fetchdf()
	        finally:
	            # Close even when execution raises, so failed queries do not leak connections.
	            con.close()
	    except Exception as e:
	        return None, f"Error executing query: {e}"
	    return result_df, ""

	def generate_plot(plot_code, result_df):
	    """
	    Executes the plot code to generate a plot from the result DataFrame.

	    Every occurrence of the placeholders ``x_column`` and ``y_column`` in
	    ``plot_code`` is replaced by the first and second column name of
	    ``result_df``. The code is then executed with ``result_df`` and ``px`` in
	    scope and is expected to bind the figure to a variable named ``fig``.

	    Args:
	        plot_code: Python source that builds the figure.
	        result_df: DataFrame holding the query results.

	    Returns:
	        ``(fig, "")`` on success, or ``(None, error_message)`` when there is no
	        code, no data, fewer than two columns, no ``fig`` was produced, or the
	        code raised.
	    """
	    import textwrap

	    if not plot_code or not plot_code.strip():
	        return None, "No plot code provided."
	    try:
	        if result_df.empty:
	            return None, "Result DataFrame is empty."
	        columns = result_df.columns.tolist()
	        if len(columns) < 2:
	            return None, "Not enough columns to plot."

	        # Escape backslashes and quotes so the names stay valid inside the
	        # quoted string literals that hold the placeholders.
	        x_name, y_name = (
	            str(name).replace("\\", "\\\\").replace("'", "\\'")
	            for name in columns[:2]
	        )
	        # Uniformly indented code would otherwise fail with IndentationError.
	        code = textwrap.dedent(plot_code)
	        # Substitute both placeholders in one pass: splitting on 'x_column'
	        # first keeps a column name that itself contains 'y_column' from being
	        # rewritten by the second substitution.
	        code = x_name.join(
	            part.replace('y_column', y_name) for part in code.split('x_column')
	        )

	        # SECURITY: exec() runs arbitrary Python. plot_code comes from an editable
	        # UI field, so anyone who can reach the app can run code on the host.
	        # Restrict access to the app or replace this with a fixed set of plot types.
	        local_vars = {'result_df': result_df, 'px': px}
	        exec(code, {}, local_vars)
	        fig = local_vars.get('fig')
	        if fig is None:
	            return None, "Plot could not be generated."
	        return fig, ""
	    except Exception as e:
	        return None, f"Error generating plot: {e}"

	# =========================
	# Schema Display
	# =========================

	@lru_cache(maxsize=1)
	def get_schema_json():
	    """Return the schema from ``get_schema()`` as JSON text indented by two spaces.

	    The rendered string is cached after the first call.
	    """
	    schema_columns = get_schema()
	    return json.dumps(schema_columns, indent=2)

	# =========================
	# Initialize Dataset Schema
	# =========================

	# Fail fast at import time: without a readable dataset no query can succeed,
	# so stop before the UI is built.
	if not load_dataset_schema():
	    raise Exception(
	        "Failed to load dataset schema. Please check the dataset path and format."
	    )

	# =========================
	# Gradio Application UI
	# =========================

	with gr.Blocks() as demo:
	    gr.Markdown("""
	# Parquet SQL Query and Plotting App

	Query and visualize data in `sample_contract_df.parquet`
	""")

	    with gr.Tabs():
	        with gr.TabItem("Query Data"):
	            with gr.Row():
	                # Left column: query input, generated SQL / plot code, and errors.
	                with gr.Column(scale=1):
	                    query = gr.Textbox(
	                        label="Natural Language Query",
	                        placeholder='e.g., "Show all awards greater than 1,000,000 in California"',
	                        lines=4
	                    )
	                    btn_generate = gr.Button("Generate SQL")
	                    sql_out = gr.Code(label="Generated SQL Query", language="sql")
	                    plot_code_out = gr.Code(label="Generated Plot Code", language="python")
	                    btn_execute = gr.Button("Execute Query")
	                    # Hidden until a handler reports an error.
	                    error_out = gr.Markdown("", visible=False)
	                # Right column: tabular results and the optional plot.
	                with gr.Column(scale=2):
	                    results_out = gr.Dataframe(label="Query Results", interactive=False)
	                    plot_out = gr.Plot(label="Plot")

	        with gr.TabItem("Dataset Schema"):
	            gr.Markdown("### Dataset Schema")
	            schema_display = gr.JSON(label="Schema", value=json.loads(get_schema_json()))

	    async def on_generate_click(nl_query):
	        """Return the generated SQL and plot code for the two code editors."""
	        sql_query, plot_code = await generate_sql_and_plot_code(nl_query)
	        return sql_query, plot_code

	    def on_execute_click(sql_query, plot_code):
	        """
	        Run the SQL and, when plot code is present, build the plot.

	        Returns values for the results table, the plot, and the error panel.
	        The error panel is shown only when there is a message to display and
	        is hidden again on a clean run.
	        """
	        result_df, error_msg = execute_query(sql_query)
	        if error_msg:
	            return None, None, gr.update(value=error_msg, visible=True)

	        fig, plot_error = None, ""
	        # The code editor may hold no value at all, so guard before stripping.
	        if plot_code and plot_code.strip():
	            fig, plot_error = generate_plot(plot_code, result_df)
	        # error_out starts with visible=False; returning a plain string would
	        # update its text but leave it hidden, so toggle visibility explicitly.
	        return result_df, fig, gr.update(value=plot_error or "", visible=bool(plot_error))

	    btn_generate.click(
	        fn=on_generate_click,
	        inputs=query,
	        outputs=[sql_out, plot_code_out],
	    )
	    btn_execute.click(
	        fn=on_execute_click,
	        inputs=[sql_out, plot_code_out],
	        outputs=[results_out, plot_out, error_out],
	    )

	demo.launch()