LeonceNsh committed on
Commit
7012184
·
verified ·
1 Parent(s): 1f06762

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +76 -43
app.py CHANGED
@@ -3,6 +3,7 @@ import gradio as gr
3
  import duckdb
4
  import re
5
  from functools import lru_cache
 
6
 
7
  # Load the Parquet dataset path
8
  dataset_path = 'sample_contract_df.parquet' # Update with your Parquet file path
@@ -42,9 +43,6 @@ def get_schema():
42
  # Map column names to their types
43
  COLUMN_TYPES = {col['column_name']: col['column_type'] for col in get_schema()}
44
 
45
- # Define columns that are numeric
46
- NUMERIC_COLUMNS = {col['column_name'] for col in get_schema() if col['column_type'] in ['DOUBLE', 'BIGINT', 'INT', 'FLOAT', 'DECIMAL']}
47
-
48
  # Function to load the dataset schema into DuckDB
49
  @lru_cache(maxsize=1)
50
  def load_dataset_schema():
@@ -60,39 +58,74 @@ def load_dataset_schema():
60
  finally:
61
  con.close()
62
 
63
- # Advanced Natural Language to SQL Parser using schema
 
 
 
 
 
 
 
64
  def parse_query(nl_query):
65
  """
66
  Converts a natural language query into SQL WHERE conditions based on the schema.
67
  """
68
- # Lowercase the query for uniformity
69
  query = nl_query.lower()
70
 
71
- # Define patterns for comparison
72
- patterns = [
73
- (r'(\w+)\s+(greater than or equal to|>=)\s+([\d\.]+)', r"\1 >= \3"),
74
- (r'(\w+)\s+(less than or equal to|<=)\s+([\d\.]+)', r"\1 <= \3"),
75
- (r'(\w+)\s+(greater than|>)\s+([\d\.]+)', r"\1 > \3"),
76
- (r'(\w+)\s+(less than|<)\s+([\d\.]+)', r"\1 < \3"),
77
- (r'(\w+)\s+(equal to|=|equals to|equals|equal)\s+\"?([\w\s]+)\"?', r'\1 = "\3"'),
78
- (r'(\w+)\s+(not equal to|!=|not equal)\s+\"?([\w\s]+)\"?', r'\1 != "\3"'),
79
- (r'between\s+([\d\.]+)\s+and\s+([\d\.]+)', r'BETWEEN \1 AND \2'),
80
- (r'(\w+)\s+contains\s+\"?([\w\s]+)\"?', r'\1 LIKE "%\2%"'),
81
- (r'(\w+)\s+starts with\s+\"?([\w\s]+)\"?', r'\1 LIKE "\2%"'),
82
- (r'(\w+)\s+ends with\s+\"?([\w\s]+)\"?', r'\1 LIKE "%\2"'),
83
- ]
84
-
85
- # Apply each pattern
86
- for pattern, replacement in patterns:
87
- query = re.sub(pattern, replacement, query)
88
-
89
- # Handle logical operators
90
- query = query.replace(' and ', ' AND ').replace(' or ', ' OR ')
91
-
92
- # Remove any unintended multiple spaces
93
- query = re.sub(r'\s+', ' ', query).strip()
94
-
95
- return query
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
 
97
  # Generate SQL based on user query
98
  def generate_sql_query(query):
@@ -100,7 +133,10 @@ def generate_sql_query(query):
100
  Generates a SQL query based on the natural language input.
101
  """
102
  condition = parse_query(query)
103
- sql_query = f"SELECT * FROM contract_data WHERE {condition}"
 
 
 
104
  return sql_query
105
 
106
  # Execute the SQL query and return results or error
@@ -138,23 +174,20 @@ with gr.Blocks() as demo:
138
  ## Instructions
139
 
140
  1. **Describe the data you want to retrieve**: For example:
141
- - `award greater than 1000000`
142
- - `department_ind_agency equals "Health and Human Services"`
143
- - What is the total value of awards granted by each state in the year 2022?
144
- - Which awardees received multiple awards, and what are the corresponding award amounts?
145
- - How many awards were issued by each department division, such as the Department of Defense or Veterans Affairs?
146
- - What is the distribution of awards by city and zip code across different countries?
147
- - Which awards are currently active, and what are their respective award numbers and dates?
148
-
149
-
150
  2. **Generate SQL**: Click "Generate SQL" to see the SQL query that will be executed.
151
  3. **Execute Query**: Click "Execute Query" to run the query and view the results.
152
  4. **View Dataset Schema**: Check the "Dataset Schema" tab to understand available columns and their types.
153
 
154
  ## Example Queries
155
 
156
- - `award > 720000000`
157
- - `award greater than 10000000`
158
  """)
159
 
160
  with gr.Tabs():
@@ -164,7 +197,7 @@ with gr.Blocks() as demo:
164
  with gr.Column(scale=1):
165
  query = gr.Textbox(
166
  label="Natural Language Query",
167
- placeholder='e.g., "award greater than 1000 and state equals \\"CA\\""',
168
  lines=4
169
  )
170
  btn_generate = gr.Button("Generate SQL")
 
3
  import duckdb
4
  import re
5
  from functools import lru_cache
6
+ from transformers import pipeline
7
 
8
  # Load the Parquet dataset path
9
  dataset_path = 'sample_contract_df.parquet' # Update with your Parquet file path
 
43
  # Map column names to their types
44
  COLUMN_TYPES = {col['column_name']: col['column_type'] for col in get_schema()}
45
 
 
 
 
46
  # Function to load the dataset schema into DuckDB
47
  @lru_cache(maxsize=1)
48
  def load_dataset_schema():
 
58
  finally:
59
  con.close()
60
 
61
# Shared NLP model accessor (built once, then served from the cache)
@lru_cache(maxsize=1)
def get_nlp_model():
    """
    Returns the zero-shot-classification pipeline backed by
    "facebook/bart-large-mnli".

    The function takes no arguments and is wrapped in lru_cache, so the
    pipeline is constructed on the first call and the same object is
    returned on every later call.

    NOTE(review): nothing in the visible part of this file calls this
    function - confirm it is actually needed before paying for the model load.
    """
    return pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
67
+
68
# Schema-aware natural language to SQL condition parser (regex heuristics only)

# Supported operation phrases. A phrase that is a prefix of another one
# ("greater than" vs "greater than or equal to") is listed after it, because
# regex alternation picks the first alternative that matches.
_OPERATIONS = (
    'greater than or equal to',
    'less than or equal to',
    'greater than',
    'less than',
    'equal to',
    'not equal to',
    'between',
    'contains',
    'starts with',
    'ends with',
)

# Operation phrase -> SQL operator / LIKE template.
_COMPARISON_SQL = {
    'greater than or equal to': '>=',
    'less than or equal to': '<=',
    'greater than': '>',
    'less than': '<',
}
_EQUALITY_SQL = {'equal to': '=', 'not equal to': '!='}
_LIKE_SQL = {'contains': '%{}%', 'starts with': '{}%', 'ends with': '%{}'}

# ASCII-only digit classes: \d would also accept non-ASCII digits, which must
# never be emitted into SQL as a bare numeric literal.
_PLAIN_NUMBER = re.compile(r'-?[0-9]+(?:\.[0-9]+)?')
_GROUPED_NUMBER = re.compile(r'-?[0-9]{1,3}(?:,[0-9]{3})+(?:\.[0-9]+)?')
_TRAILING_CONNECTOR = re.compile(r'\s+(and|or)\s*$', re.IGNORECASE)
_BETWEEN_SEPARATOR = re.compile(r'\s+and\s+', re.IGNORECASE)


def _unquote(text):
    """Strips surrounding whitespace and surrounding single/double quotes."""
    return text.strip().strip('"\'').strip()


def _sql_string(value):
    """Renders value as a single-quoted SQL string literal ('' escapes ')."""
    return "'" + value.replace("'", "''") + "'"


def _sql_scalar(value):
    """Renders value as a bare number when it is one, else as a string literal."""
    if _GROUPED_NUMBER.fullmatch(value):
        # Accept thousands separators such as 1,000,000.
        value = value.replace(',', '')
    if _PLAIN_NUMBER.fullmatch(value):
        return value
    return _sql_string(value)


def _build_condition(column, operation, value):
    """Builds one SQL condition, or returns '' when the value is unusable."""
    if operation in _COMPARISON_SQL:
        if not value:
            return ''
        return f"{column} {_COMPARISON_SQL[operation]} {_sql_scalar(value)}"
    if operation in _EQUALITY_SQL:
        return f"{column} {_EQUALITY_SQL[operation]} {_sql_string(value)}"
    if operation in _LIKE_SQL:
        return f"{column} LIKE {_sql_string(_LIKE_SQL[operation].format(value))}"
    # The only remaining operation is 'between': it needs exactly two bounds.
    bounds = [_unquote(bound) for bound in _BETWEEN_SEPARATOR.split(value)]
    if len(bounds) != 2 or not all(bounds):
        return ''
    return f"{column} BETWEEN {_sql_scalar(bounds[0])} AND {_sql_scalar(bounds[1])}"


def parse_query(nl_query):
    """
    Converts a natural language query into SQL WHERE conditions based on the schema.

    The query is scanned for clauses of the form "<column> <operation> <value>",
    where <column> is a column name returned by get_schema() and <operation> is
    one of the phrases in _OPERATIONS. Matching is case-insensitive, but values
    keep the case the user typed. A value runs up to the next recognised clause
    (minus the "and"/"or" that joins them) or to the end of the query.

    Values are never spliced into the SQL verbatim: operands of comparisons and
    BETWEEN are emitted bare only when they are plain numbers (thousands
    separators are removed); everything else becomes a single-quoted string
    literal with embedded single quotes doubled.

    Args:
        nl_query: The user's free-text query. None or blank yields ''.

    Returns:
        The conditions joined by AND (or OR where the user wrote "or" between
        two clauses), without the WHERE keyword and without added parentheses,
        so normal SQL precedence applies. Returns '' when no clause is
        recognised.
    """
    if not nl_query or not nl_query.strip():
        return ''

    # Lower-cased name -> spelling used in the schema, so the generated SQL
    # always uses the schema's own column names.
    schema_names = {}
    for col in get_schema():
        schema_names.setdefault(col['column_name'].lower(), col['column_name'])
    if not schema_names:
        return ''

    # Longest names first so a column that is a prefix of another cannot win.
    column_alt = '|'.join(
        re.escape(name) for name in sorted(schema_names, key=len, reverse=True)
    )
    operation_alt = '|'.join(phrase.replace(' ', r'\s+') for phrase in _OPERATIONS)
    # (?<!\w) keeps "award" from matching inside a longer word.
    clause_head = re.compile(
        rf'(?<!\w)({column_alt})\s+({operation_alt})\s+', re.IGNORECASE
    )

    heads = list(clause_head.finditer(nl_query))
    # Each value ends where the next clause starts (or at the end of the query).
    value_ends = [match.start() for match in heads[1:]] + [len(nl_query)]

    where_clause = ''
    connector = 'AND'
    for match, value_end in zip(heads, value_ends):
        raw_value = nl_query[match.end():value_end]
        next_connector = 'AND'
        if value_end < len(nl_query):
            # Another clause follows: drop the word joining the two clauses and
            # remember whether the user asked for OR.
            trailing = _TRAILING_CONNECTOR.search(raw_value)
            if trailing:
                raw_value = raw_value[:trailing.start()]
                if trailing.group(1).lower() == 'or':
                    next_connector = 'OR'

        column = schema_names.get(match.group(1).lower())
        operation = ' '.join(match.group(2).lower().split())
        condition = ''
        if column is not None:
            condition = _build_condition(column, operation, _unquote(raw_value))

        if condition:
            if where_clause:
                where_clause = f"{where_clause} {connector} {condition}"
            else:
                where_clause = condition
        connector = next_connector

    return where_clause
129
 
130
  # Generate SQL based on user query
131
  def generate_sql_query(query):
 
133
  Generates a SQL query based on the natural language input.
134
  """
135
  condition = parse_query(query)
136
+ if condition:
137
+ sql_query = f"SELECT * FROM contract_data WHERE {condition}"
138
+ else:
139
+ sql_query = "SELECT * FROM contract_data"
140
  return sql_query
141
 
142
  # Execute the SQL query and return results or error
 
174
  ## Instructions
175
 
176
  1. **Describe the data you want to retrieve**: For example:
177
+ - `Show all awards greater than 1,000,000 in California`
178
+ - `List awardees who received multiple awards along with award amounts`
179
+ - `Number of awards issued by each department division`
180
+ - `Distribution of awards by city and zip code across different countries`
181
+ - `Active awards with their award numbers and dates`
182
+
 
 
 
183
  2. **Generate SQL**: Click "Generate SQL" to see the SQL query that will be executed.
184
  3. **Execute Query**: Click "Execute Query" to run the query and view the results.
185
  4. **View Dataset Schema**: Check the "Dataset Schema" tab to understand available columns and their types.
186
 
187
  ## Example Queries
188
 
189
+ - `award greater than 1000000 and state equal to "CA"`
190
+ - `List awards where department_ind_agency contains "Defense"`
191
  """)
192
 
193
  with gr.Tabs():
 
197
  with gr.Column(scale=1):
198
  query = gr.Textbox(
199
  label="Natural Language Query",
200
+ placeholder='e.g., "Show all awards greater than 1,000,000 in California"',
201
  lines=4
202
  )
203
  btn_generate = gr.Button("Generate SQL")