Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -1,101 +1,182 @@
|
|
1 |
import json
|
2 |
import gradio as gr
|
3 |
import duckdb
|
|
|
|
|
4 |
|
5 |
# Load the Parquet dataset path
|
6 |
dataset_path = 'sample_contract_df.parquet' # Update with your Parquet file path
|
7 |
|
8 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
9 |
def load_dataset_schema():
|
10 |
con = duckdb.connect()
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
|
16 |
-
#
|
17 |
def parse_query(nl_query):
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
'greater than or equal to'
|
27 |
-
'less than or equal to'
|
28 |
-
'greater than'
|
29 |
-
'less than'
|
30 |
-
'
|
31 |
-
'
|
32 |
-
'
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
return sql_query
|
38 |
|
39 |
-
#
|
40 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
41 |
condition = parse_query(query)
|
42 |
sql_query = f"SELECT * FROM contract_data WHERE {condition}"
|
43 |
return sql_query
|
44 |
|
45 |
-
# Execute the SQL query and
|
46 |
def execute_query(sql_query):
|
|
|
|
|
|
|
47 |
try:
|
48 |
con = duckdb.connect()
|
49 |
-
|
|
|
50 |
result_df = con.execute(sql_query).fetchdf()
|
51 |
con.close()
|
52 |
return result_df, ""
|
53 |
except Exception as e:
|
54 |
-
|
55 |
return None, f"Error executing query: {e}"
|
56 |
|
57 |
-
#
|
58 |
-
|
59 |
-
|
|
|
|
|
|
|
|
|
|
|
60 |
|
61 |
# Gradio app UI
|
62 |
with gr.Blocks() as demo:
|
63 |
gr.Markdown("""
|
64 |
# Local Parquet SQL Query App
|
65 |
|
66 |
-
Query and explore data in `sample_contract_df.parquet` using DuckDB and SQL queries.
|
67 |
|
68 |
## Instructions
|
69 |
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
74 |
""")
|
75 |
|
76 |
with gr.Tabs():
|
|
|
77 |
with gr.TabItem("Query Data"):
|
78 |
with gr.Row():
|
79 |
with gr.Column(scale=1):
|
80 |
query = gr.Textbox(
|
81 |
label="Natural Language Query",
|
82 |
-
placeholder=
|
|
|
83 |
)
|
84 |
btn_generate = gr.Button("Generate SQL")
|
85 |
sql_out = gr.Code(label="Generated SQL Query", language="sql")
|
86 |
btn_execute = gr.Button("Execute Query")
|
87 |
error_out = gr.Markdown("", visible=False)
|
88 |
with gr.Column(scale=2):
|
89 |
-
results_out = gr.Dataframe(label="Query Results")
|
90 |
|
|
|
91 |
with gr.TabItem("Dataset Schema"):
|
92 |
gr.Markdown("### Dataset Schema")
|
93 |
-
|
94 |
|
95 |
# Set up click events
|
96 |
btn_generate.click(
|
97 |
fn=generate_sql_query,
|
98 |
-
inputs=
|
99 |
outputs=sql_out,
|
100 |
)
|
101 |
btn_execute.click(
|
|
|
1 |
import json
|
2 |
import gradio as gr
|
3 |
import duckdb
|
4 |
+
import re
|
5 |
+
from functools import lru_cache
|
6 |
|
7 |
# Path to the Parquet dataset queried by the app.
dataset_path = 'sample_contract_df.parquet'  # Update with your Parquet file path

# (column name, DuckDB type) pairs for sample_contract_df.parquet.
_SCHEMA_COLUMNS = [
    ("department_ind_agency", "VARCHAR"),
    ("cgac", "BIGINT"),
    ("sub_tier", "VARCHAR"),
    ("fpds_code", "VARCHAR"),
    ("office", "VARCHAR"),
    ("aac_code", "VARCHAR"),
    ("posteddate", "VARCHAR"),
    ("type", "VARCHAR"),
    ("basetype", "VARCHAR"),
    ("popstreetaddress", "VARCHAR"),
    ("popcity", "VARCHAR"),
    ("popstate", "VARCHAR"),
    ("popzip", "VARCHAR"),
    ("popcountry", "VARCHAR"),
    ("active", "VARCHAR"),
    ("awardnumber", "VARCHAR"),
    ("awarddate", "VARCHAR"),
    ("award", "DOUBLE"),
    ("awardee", "VARCHAR"),
    ("state", "VARCHAR"),
    ("city", "VARCHAR"),
    ("zipcode", "VARCHAR"),
    ("countrycode", "VARCHAR"),
]

# Schema records in DESCRIBE-style dict form (same shape as the hand-written
# literal list this replaces: every column nullable, no key/default/extra).
schema = [
    {
        "column_name": name,
        "column_type": ctype,
        "null": "YES",
        "key": None,
        "default": None,
        "extra": None,
    }
    for name, ctype in _SCHEMA_COLUMNS
]

# Memoized accessor for the static schema records.
@lru_cache(maxsize=1)
def get_schema():
    """Return the static schema records for the contract dataset."""
    return schema

# column name -> DuckDB column type
COLUMN_TYPES = {entry["column_name"]: entry["column_type"] for entry in get_schema()}

# Names of the numeric columns (everything else is treated as text).
NUMERIC_COLUMNS = {
    entry["column_name"]
    for entry in get_schema()
    if entry["column_type"] in ("DOUBLE", "BIGINT", "INT", "FLOAT", "DECIMAL")
}
47 |
+
|
48 |
+
# Create (or recreate) the DuckDB view over the Parquet file.
@lru_cache(maxsize=1)
def load_dataset_schema():
    """Register the `contract_data` view in DuckDB.

    Returns True on success, False if the view could not be created
    (e.g. the Parquet file is missing). Memoized, so the view is only
    (re)created once per process.
    """
    con = duckdb.connect()
    try:
        # Recreate from scratch to avoid "view already exists" errors.
        con.execute("DROP VIEW IF EXISTS contract_data")
        con.execute(f"CREATE VIEW contract_data AS SELECT * FROM '{dataset_path}'")
    except Exception as e:
        print(f"Error loading dataset schema: {e}")
        return False
    else:
        return True
    finally:
        con.close()
|
63 |
+
# Natural-language -> SQL WHERE-clause translator (regex based).
def parse_query(nl_query):
    """
    Convert a natural language query into a SQL WHERE condition string.

    Supports numeric comparisons (greater/less than [or equal to], between),
    string comparisons (equals, not equal, contains, starts/ends with) and
    the logical connectives "and" / "or".

    Note: the whole query is lowercased first, so matching is
    case-insensitive but string values are emitted in lowercase.
    """
    query = nl_query.lower()

    # Numeric comparisons first, so "greater than or equal to" is consumed
    # before the and/or connective pass touches its "or".
    numeric_patterns = [
        (r'(\w+)\s+(?:greater than or equal to|>=)\s+([\d\.]+)', r'\1 >= \2'),
        (r'(\w+)\s+(?:less than or equal to|<=)\s+([\d\.]+)', r'\1 <= \2'),
        (r'(\w+)\s+(?:greater than|>)\s+([\d\.]+)', r'\1 > \2'),
        (r'(\w+)\s+(?:less than|<)\s+([\d\.]+)', r'\1 < \2'),
        (r'between\s+([\d\.]+)\s+and\s+([\d\.]+)', r'BETWEEN \1 AND \2'),
    ]
    for pattern, replacement in numeric_patterns:
        query = re.sub(pattern, replacement, query)

    # A value is one or more words, but stops before "and"/"or" so that
    # `state equals ca and award > 1` does not swallow the rest of the
    # query into the value (the old greedy [\w\s]+ did exactly that).
    value = r'(\w+(?:\s+(?!and\b|or\b)\w+)*)'

    # String comparisons must use SINGLE quotes: in DuckDB (standard SQL)
    # double quotes denote identifiers, so `x = "approved"` is parsed as a
    # column reference and the query fails at execution time.
    # "not equal" is tried before "equal"; the old order let the equal
    # pattern match inside "not equal to X" and mangle it.
    string_patterns = [
        (r'(\w+)\s+(?:not equal to|!=|not equal)\s+\"?' + value + r'\"?', r"\1 != '\2'"),
        (r'(\w+)\s+(?:equal to|=|equals to|equals|equal)\s+\"?' + value + r'\"?', r"\1 = '\2'"),
        (r'(\w+)\s+contains\s+\"?' + value + r'\"?', r"\1 LIKE '%\2%'"),
        (r'(\w+)\s+starts with\s+\"?' + value + r'\"?', r"\1 LIKE '\2%'"),
        (r'(\w+)\s+ends with\s+\"?' + value + r'\"?', r"\1 LIKE '%\2'"),
    ]
    for pattern, replacement in string_patterns:
        query = re.sub(pattern, replacement, query)

    # Uppercase the logical connectives for SQL.
    query = query.replace(' and ', ' AND ').replace(' or ', ' OR ')

    # Collapse any stray multiple spaces.
    return re.sub(r'\s+', ' ', query).strip()
|
97 |
+
# Build a full SELECT statement from the natural language input.
def generate_sql_query(query):
    """
    Generate a SQL query over `contract_data` from natural language input.

    Falls back to an unfiltered SELECT when the parsed condition is empty
    (e.g. blank input), since "SELECT ... WHERE " with no condition is
    invalid SQL and would always fail at execution time.
    """
    condition = parse_query(query)
    if not condition:
        return "SELECT * FROM contract_data"
    return f"SELECT * FROM contract_data WHERE {condition}"
105 |
|
106 |
+
# Execute the SQL query and return results or error
def execute_query(sql_query):
    """
    Execute `sql_query` against the DuckDB view over the Parquet file.

    Returns (DataFrame, "") on success, or (None, error message) if the
    query fails for any reason.
    """
    try:
        con = duckdb.connect()
        try:
            # Ensure the view exists in this fresh connection.
            con.execute(f"CREATE OR REPLACE VIEW contract_data AS SELECT * FROM '{dataset_path}'")
            result_df = con.execute(sql_query).fetchdf()
        finally:
            # Close unconditionally: previously the connection leaked
            # whenever execute() raised before reaching con.close().
            con.close()
        return result_df, ""
    except Exception as e:
        # Surface the failure to the UI instead of crashing the app.
        return None, f"Error executing query: {e}"
121 |
|
122 |
+
# Memoized JSON rendering of the schema for the UI's schema tab.
@lru_cache(maxsize=1)
def get_schema_json():
    """Serialize the dataset schema to an indented JSON string."""
    schema_records = get_schema()
    return json.dumps(schema_records, indent=2)
126 |
+
|
127 |
+
# Initialize the dataset schema
# Fail fast at import time: the UI below is useless without the
# `contract_data` view, so abort startup if it cannot be created.
if not load_dataset_schema():
    raise Exception("Failed to load dataset schema. Please check the dataset path and format.")
130 |
|
131 |
# Gradio app UI
|
132 |
with gr.Blocks() as demo:
|
133 |
gr.Markdown("""
|
134 |
# Local Parquet SQL Query App
|
135 |
|
136 |
+
**Query and explore data** in `sample_contract_df.parquet` using DuckDB and natural language SQL queries.
|
137 |
|
138 |
## Instructions
|
139 |
|
140 |
+
1. **Enter a Natural Language Query**: Describe the data you want to retrieve. For example:
|
141 |
+
- `award greater than 1000 and status equal to "approved"`
|
142 |
+
- `department_ind_agency equals "Health and Human Services" or awardee contains "Tech"`
|
143 |
+
2. **Generate SQL**: Click "Generate SQL" to see the SQL query that will be executed.
|
144 |
+
3. **Execute Query**: Click "Execute Query" to run the query and view the results.
|
145 |
+
4. **View Dataset Schema**: Check the "Dataset Schema" tab to understand available columns and their types.
|
146 |
+
|
147 |
+
## Example Queries
|
148 |
+
|
149 |
+
- `award greater than 50000 and state equals "CA"`
|
150 |
+
- `awardee contains "Solutions" or award less than 10000`
|
151 |
+
- `department_ind_agency equals "Defense" and awarddate greater than "2023-01-01"`
|
152 |
""")
|
153 |
|
154 |
with gr.Tabs():
|
155 |
+
# Query Tab
|
156 |
with gr.TabItem("Query Data"):
|
157 |
with gr.Row():
|
158 |
with gr.Column(scale=1):
|
159 |
query = gr.Textbox(
|
160 |
label="Natural Language Query",
|
161 |
+
placeholder='e.g., "award greater than 1000 and state equals \\"CA\\""',
|
162 |
+
lines=4
|
163 |
)
|
164 |
btn_generate = gr.Button("Generate SQL")
|
165 |
sql_out = gr.Code(label="Generated SQL Query", language="sql")
|
166 |
btn_execute = gr.Button("Execute Query")
|
167 |
error_out = gr.Markdown("", visible=False)
|
168 |
with gr.Column(scale=2):
|
169 |
+
results_out = gr.Dataframe(label="Query Results", interactive=False)
|
170 |
|
171 |
+
# Schema Tab
|
172 |
with gr.TabItem("Dataset Schema"):
|
173 |
gr.Markdown("### Dataset Schema")
|
174 |
+
schema_display = gr.JSON(label="Schema", value=json.loads(get_schema_json()))
|
175 |
|
176 |
# Set up click events
|
177 |
btn_generate.click(
|
178 |
fn=generate_sql_query,
|
179 |
+
inputs=query,
|
180 |
outputs=sql_out,
|
181 |
)
|
182 |
btn_execute.click(
|