LeonceNsh committed on
Commit
d33fe62
·
verified ·
1 Parent(s): 8760634

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +124 -43
app.py CHANGED
@@ -1,101 +1,182 @@
1
  import json
2
  import gradio as gr
3
  import duckdb
 
 
4
 
5
  # Load the Parquet dataset path
6
  dataset_path = 'sample_contract_df.parquet' # Update with your Parquet file path
7
 
8
- # Load the dataset schema with DuckDB
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  def load_dataset_schema():
10
  con = duckdb.connect()
11
- con.execute(f"CREATE VIEW contract_data AS SELECT * FROM '{dataset_path}'")
12
- schema = con.execute("DESCRIBE contract_data").fetchdf()
13
- con.close()
14
- return schema.to_dict(orient="records")
 
 
 
 
 
 
15
 
16
- # Simple parser to convert natural language to SQL syntax
17
  def parse_query(nl_query):
18
- # Replace common phrases with SQL syntax
19
- replacements = {
20
- 'equal to': '=',
21
- 'equals to': '=',
22
- 'equals': '=',
23
- 'equal': '=',
24
- 'not equal to': '!=',
25
- 'not equal': '!=',
26
- 'greater than or equal to': '>=',
27
- 'less than or equal to': '<=',
28
- 'greater than': '>',
29
- 'less than': '<',
30
- 'between': 'BETWEEN',
31
- ' and ': ' AND ',
32
- ' or ': ' OR ',
33
- }
34
- sql_query = nl_query.lower()
35
- for k, v in replacements.items():
36
- sql_query = sql_query.replace(k, v)
37
- return sql_query
38
 
39
- # Generate SQL based on schema and user query
40
- def generate_sql_query(features, query):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  condition = parse_query(query)
42
  sql_query = f"SELECT * FROM contract_data WHERE {condition}"
43
  return sql_query
44
 
45
- # Execute the SQL query and display results
46
  def execute_query(sql_query):
 
 
 
47
  try:
48
  con = duckdb.connect()
49
- con.execute(f"CREATE VIEW contract_data AS SELECT * FROM '{dataset_path}'")
 
50
  result_df = con.execute(sql_query).fetchdf()
51
  con.close()
52
  return result_df, ""
53
  except Exception as e:
54
- con.close()
55
  return None, f"Error executing query: {e}"
56
 
57
- # Load dataset schema and convert it to JSON for display
58
- schema = load_dataset_schema()
59
- schema_json = json.dumps(schema, indent=2)
 
 
 
 
 
60
 
61
  # Gradio app UI
62
  with gr.Blocks() as demo:
63
  gr.Markdown("""
64
  # Local Parquet SQL Query App
65
 
66
- Query and explore data in `sample_contract_df.parquet` using DuckDB and SQL queries.
67
 
68
  ## Instructions
69
 
70
- - Enter a natural language query describing the data you want to retrieve.
71
- - Click "Generate SQL" to see the SQL query that will be executed.
72
- - Click "Execute Query" to run the query and see the results.
73
- - You can view the dataset schema in the "Dataset Schema" tab.
 
 
 
 
 
 
 
 
74
  """)
75
 
76
  with gr.Tabs():
 
77
  with gr.TabItem("Query Data"):
78
  with gr.Row():
79
  with gr.Column(scale=1):
80
  query = gr.Textbox(
81
  label="Natural Language Query",
82
- placeholder="e.g., 'amount greater than 1000 and status equal to \"approved\"'"
 
83
  )
84
  btn_generate = gr.Button("Generate SQL")
85
  sql_out = gr.Code(label="Generated SQL Query", language="sql")
86
  btn_execute = gr.Button("Execute Query")
87
  error_out = gr.Markdown("", visible=False)
88
  with gr.Column(scale=2):
89
- results_out = gr.Dataframe(label="Query Results")
90
 
 
91
  with gr.TabItem("Dataset Schema"):
92
  gr.Markdown("### Dataset Schema")
93
- features = gr.Code(label="Dataset Schema", value=schema_json, language="json")
94
 
95
  # Set up click events
96
  btn_generate.click(
97
  fn=generate_sql_query,
98
- inputs=[features, query],
99
  outputs=sql_out,
100
  )
101
  btn_execute.click(
 
1
  import json
2
  import gradio as gr
3
  import duckdb
4
+ import re
5
+ from functools import lru_cache
6
 
7
  # Load the Parquet dataset path
8
  dataset_path = 'sample_contract_df.parquet' # Update with your Parquet file path
9
 
10
+ # Provided schema
11
+ schema = [
12
+ {"column_name": "department_ind_agency", "column_type": "VARCHAR", "null": "YES", "key": None, "default": None, "extra": None},
13
+ {"column_name": "cgac", "column_type": "BIGINT", "null": "YES", "key": None, "default": None, "extra": None},
14
+ {"column_name": "sub_tier", "column_type": "VARCHAR", "null": "YES", "key": None, "default": None, "extra": None},
15
+ {"column_name": "fpds_code", "column_type": "VARCHAR", "null": "YES", "key": None, "default": None, "extra": None},
16
+ {"column_name": "office", "column_type": "VARCHAR", "null": "YES", "key": None, "default": None, "extra": None},
17
+ {"column_name": "aac_code", "column_type": "VARCHAR", "null": "YES", "key": None, "default": None, "extra": None},
18
+ {"column_name": "posteddate", "column_type": "VARCHAR", "null": "YES", "key": None, "default": None, "extra": None},
19
+ {"column_name": "type", "column_type": "VARCHAR", "null": "YES", "key": None, "default": None, "extra": None},
20
+ {"column_name": "basetype", "column_type": "VARCHAR", "null": "YES", "key": None, "default": None, "extra": None},
21
+ {"column_name": "popstreetaddress", "column_type": "VARCHAR", "null": "YES", "key": None, "default": None, "extra": None},
22
+ {"column_name": "popcity", "column_type": "VARCHAR", "null": "YES", "key": None, "default": None, "extra": None},
23
+ {"column_name": "popstate", "column_type": "VARCHAR", "null": "YES", "key": None, "default": None, "extra": None},
24
+ {"column_name": "popzip", "column_type": "VARCHAR", "null": "YES", "key": None, "default": None, "extra": None},
25
+ {"column_name": "popcountry", "column_type": "VARCHAR", "null": "YES", "key": None, "default": None, "extra": None},
26
+ {"column_name": "active", "column_type": "VARCHAR", "null": "YES", "key": None, "default": None, "extra": None},
27
+ {"column_name": "awardnumber", "column_type": "VARCHAR", "null": "YES", "key": None, "default": None, "extra": None},
28
+ {"column_name": "awarddate", "column_type": "VARCHAR", "null": "YES", "key": None, "default": None, "extra": None},
29
+ {"column_name": "award", "column_type": "DOUBLE", "null": "YES", "key": None, "default": None, "extra": None},
30
+ {"column_name": "awardee", "column_type": "VARCHAR", "null": "YES", "key": None, "default": None, "extra": None},
31
+ {"column_name": "state", "column_type": "VARCHAR", "null": "YES", "key": None, "default": None, "extra": None},
32
+ {"column_name": "city", "column_type": "VARCHAR", "null": "YES", "key": None, "default": None, "extra": None},
33
+ {"column_name": "zipcode", "column_type": "VARCHAR", "null": "YES", "key": None, "default": None, "extra": None},
34
+ {"column_name": "countrycode", "column_type": "VARCHAR", "null": "YES", "key": None, "default": None, "extra": None}
35
+ ]
36
+
37
# Memoised accessor for the hard-coded schema
@lru_cache(maxsize=1)
def get_schema():
    """Return the module-level ``schema`` list of column descriptors."""
    column_descriptors = schema
    return column_descriptors
41
+
42
# Lookup table from each column name to its declared column type
COLUMN_TYPES = dict(
    (entry['column_name'], entry['column_type']) for entry in get_schema()
)

# Names of every column whose declared type is one of the numeric type names below
NUMERIC_COLUMNS = set(
    entry['column_name']
    for entry in get_schema()
    if entry['column_type'] in ['DOUBLE', 'BIGINT', 'INT', 'FLOAT', 'DECIMAL']
)
47
+
48
# Start-up check: can DuckDB build a view over the dataset file?
@lru_cache(maxsize=1)
def load_dataset_schema():
    """
    Try to create the ``contract_data`` view over ``dataset_path``.

    The connection opened here is closed before returning, so the view is
    not available to later callers; only the success flag is kept. Because
    of ``lru_cache`` the body runs once and the flag is reused afterwards.

    Returns:
        True when both statements ran, False when any of them raised
        (the error is printed).
    """
    connection = duckdb.connect()
    loaded = True
    try:
        # Drop first so the CREATE below cannot fail on an existing view
        connection.execute("DROP VIEW IF EXISTS contract_data")
        connection.execute(f"CREATE VIEW contract_data AS SELECT * FROM '{dataset_path}'")
    except Exception as error:
        print(f"Error loading dataset schema: {error}")
        loaded = False
    finally:
        connection.close()
    return loaded
62
 
63
# Natural-language -> SQL WHERE-clause translation.
#
# Each recognised comparison phrase/symbol maps to
# (SQL operator, LIKE template or None).
_NL_OPERATORS = {
    "greater than or equal to": (">=", None),
    "less than or equal to": ("<=", None),
    "not equal to": ("!=", None),
    "not equal": ("!=", None),
    "greater than": (">", None),
    "less than": ("<", None),
    "equals to": ("=", None),
    "equal to": ("=", None),
    "equals": ("=", None),
    "equal": ("=", None),
    "contains": ("LIKE", "%{}%"),
    "starts with": ("LIKE", "{}%"),
    "ends with": ("LIKE", "%{}"),
    ">=": (">=", None),
    "<=": ("<=", None),
    "!=": ("!=", None),
    ">": (">", None),
    "<": ("<", None),
    "=": ("=", None),
}

# Operators whose unquoted numeric operand is emitted as a bare number.
_NL_ORDERING = {">", ">=", "<", "<="}

_NL_NUMBER = r"-?\d+(?:\.\d+)?"

# One alternation, scanned left to right in a single pass:
#   1. <column> <operator> <value>   (value: "..." | '...' | bare text that
#      runs up to the next " and " / " or " or the end of the query)
#   2. a stand-alone quoted string   (copied through untouched)
#   3. between <number> and <number>
#   4. a logical connector (and / or)
# Longer operator phrases are tried first so that "not equal to" is never
# read as "equal to" and ">=" is never read as ">".
_NL_TOKEN_RE = re.compile(
    r"\b(?P<col>\w+)\s+(?P<op>"
    + "|".join(re.escape(op) for op in sorted(_NL_OPERATORS, key=len, reverse=True))
    + r")\s+"
    r"""(?:"(?P<dq>[^"]*)"|'(?P<sq>[^']*)'"""
    r"""|(?P<bare>[^\s"'][^"']*?)(?=\s+(?:and|or)\s|\s*\Z))"""
    r"""|(?P<quoted>"[^"]*"|'[^']*')"""
    r"|\bbetween\s+(?P<low>" + _NL_NUMBER + r")\s+and\s+(?P<high>" + _NL_NUMBER + r")(?![\w.])"
    r"|\s+(?P<logic>and|or)\s+",
    re.IGNORECASE,
)


def _sql_string(text):
    """Return ``text`` as a single-quoted SQL string literal ('' escapes ')."""
    return "'" + text.replace("'", "''") + "'"


def _render_token(match):
    """Render one ``_NL_TOKEN_RE`` match as SQL text."""
    if match.group("logic") is not None:
        return f" {match.group('logic').upper()} "
    if match.group("quoted") is not None:
        # Quoted text outside a comparison is left exactly as typed.
        return match.group("quoted")
    if match.group("low") is not None:
        return f"BETWEEN {match.group('low')} AND {match.group('high')}"

    operator, like_template = _NL_OPERATORS[match.group("op").lower()]
    double_quoted, single_quoted, bare = match.group("dq", "sq", "bare")
    if double_quoted is not None:
        value, was_quoted = double_quoted, True
    elif single_quoted is not None:
        value, was_quoted = single_quoted, True
    else:
        value, was_quoted = bare, False

    if like_template is not None:
        literal = _sql_string(like_template.format(value))
    elif (
        not was_quoted
        and operator in _NL_ORDERING
        and re.fullmatch(_NL_NUMBER, value)
    ):
        # Ordering comparison against a plain number: keep it numeric.
        literal = value
    else:
        literal = _sql_string(value)
    return f"{match.group('col')} {operator} {literal}"


def parse_query(nl_query):
    """
    Convert a natural language filter into a SQL WHERE condition.

    Recognised forms (case-insensitive keywords, values keep their case):
      * ``col greater than / less than [or equal to] value`` and the
        symbols ``>``, ``>=``, ``<``, ``<=``
      * ``col equals / equal to / not equal to value`` and ``=``, ``!=``
      * ``col contains / starts with / ends with value`` (rendered as LIKE)
      * ``between <number> and <number>``
      * ``and`` / ``or`` between clauses

    String values are emitted as single-quoted SQL literals with embedded
    single quotes doubled. Values containing spaces, ``and`` or ``or``
    must be quoted in the input; an unquoted value ends at the next
    ``and`` / ``or``. Text that matches none of the forms is passed
    through unchanged.

    Args:
        nl_query: The user's filter text; ``None`` is treated as empty.

    Returns:
        The condition string with runs of whitespace collapsed to single
        spaces ("" for empty input).
    """
    # Collapse whitespace first (multi-line textbox input) so the operator
    # phrases can be matched with single spaces.
    normalized = " ".join((nl_query or "").split())
    return _NL_TOKEN_RE.sub(_render_token, normalized)
96
+
97
# Generate SQL based on user query
def generate_sql_query(query):
    """
    Build the SELECT statement for a natural language filter.

    Args:
        query: Natural language filter text, translated by ``parse_query``.

    Returns:
        ``SELECT * FROM contract_data WHERE <condition>``. When ``query`` is
        ``None``, empty or whitespace-only (or translates to an empty
        condition) the unfiltered ``SELECT * FROM contract_data`` is
        returned instead, because a dangling ``WHERE`` is a syntax error.
    """
    unfiltered = "SELECT * FROM contract_data"
    if query is None or not query.strip():
        return unfiltered
    condition = parse_query(query)
    if not condition:
        return unfiltered
    return f"{unfiltered} WHERE {condition}"
105
 
106
# Execute the SQL query and return results or error
def execute_query(sql_query):
    """
    Run ``sql_query`` against the ``contract_data`` view.

    A new DuckDB connection is opened for every call, the view over
    ``dataset_path`` is (re)created on it, and the result is fetched as a
    DataFrame. The connection is always closed, including when the query
    fails.

    NOTE(review): ``sql_query`` is executed verbatim. It comes from a text
    box the user can edit, so any statement DuckDB accepts will run.

    Args:
        sql_query: The SQL text to execute.

    Returns:
        ``(result_df, "")`` on success, or ``(None, "Error executing
        query: ...")`` when the query is empty or raises.
    """
    if not sql_query or not sql_query.strip():
        return None, "Error executing query: query is empty"

    con = None
    try:
        con = duckdb.connect()
        # Ensure the view is created on this connection
        con.execute(f"CREATE OR REPLACE VIEW contract_data AS SELECT * FROM '{dataset_path}'")
        result_df = con.execute(sql_query).fetchdf()
        return result_df, ""
    except Exception as e:
        # In case of error, return no dataframe and the error message
        return None, f"Error executing query: {e}"
    finally:
        # Close on every path; ``con`` is None only if connect() itself failed
        if con is not None:
            con.close()
121
 
122
# Memoised JSON rendering of the schema, used for display
@lru_cache(maxsize=1)
def get_schema_json():
    """Return the list from ``get_schema()`` serialised as JSON with 2-space indentation."""
    column_descriptors = get_schema()
    return json.dumps(column_descriptors, indent=2)
126
+
127
# Initialize the dataset schema: stop start-up when the dataset cannot be loaded.
# RuntimeError (rather than bare Exception) is still caught by ``except Exception``.
if not load_dataset_schema():
    raise RuntimeError("Failed to load dataset schema. Please check the dataset path and format.")
130
 
131
  # Gradio app UI
132
  with gr.Blocks() as demo:
133
  gr.Markdown("""
134
  # Local Parquet SQL Query App
135
 
136
+ **Query and explore data** in `sample_contract_df.parquet` using DuckDB and natural language SQL queries.
137
 
138
  ## Instructions
139
 
140
+ 1. **Enter a Natural Language Query**: Describe the data you want to retrieve. For example:
141
+ - `award greater than 1000 and status equal to "approved"`
142
+ - `department_ind_agency equals "Health and Human Services" or awardee contains "Tech"`
143
+ 2. **Generate SQL**: Click "Generate SQL" to see the SQL query that will be executed.
144
+ 3. **Execute Query**: Click "Execute Query" to run the query and view the results.
145
+ 4. **View Dataset Schema**: Check the "Dataset Schema" tab to understand available columns and their types.
146
+
147
+ ## Example Queries
148
+
149
+ - `award greater than 50000 and state equals "CA"`
150
+ - `awardee contains "Solutions" or award less than 10000`
151
+ - `department_ind_agency equals "Defense" and awarddate greater than "2023-01-01"`
152
  """)
153
 
154
  with gr.Tabs():
155
+ # Query Tab
156
  with gr.TabItem("Query Data"):
157
  with gr.Row():
158
  with gr.Column(scale=1):
159
  query = gr.Textbox(
160
  label="Natural Language Query",
161
+ placeholder='e.g., "award greater than 1000 and state equals \\"CA\\""',
162
+ lines=4
163
  )
164
  btn_generate = gr.Button("Generate SQL")
165
  sql_out = gr.Code(label="Generated SQL Query", language="sql")
166
  btn_execute = gr.Button("Execute Query")
167
  error_out = gr.Markdown("", visible=False)
168
  with gr.Column(scale=2):
169
+ results_out = gr.Dataframe(label="Query Results", interactive=False)
170
 
171
+ # Schema Tab
172
  with gr.TabItem("Dataset Schema"):
173
  gr.Markdown("### Dataset Schema")
174
+ schema_display = gr.JSON(label="Schema", value=json.loads(get_schema_json()))
175
 
176
  # Set up click events
177
  btn_generate.click(
178
  fn=generate_sql_query,
179
+ inputs=query,
180
  outputs=sql_out,
181
  )
182
  btn_execute.click(