LeonceNsh committed on
Commit
b474ae1
·
verified ·
1 Parent(s): 4a92356

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +55 -62
app.py CHANGED
@@ -1,53 +1,29 @@
1
- import json
2
  import os
3
- import urllib.parse
4
  import gradio as gr
5
- import requests
6
- from huggingface_hub import InferenceClient
7
-
8
- # InferenceClient setup (you must add your HF token as an environment variable in Colab)
9
- client = InferenceClient(
10
- "meta-llama/Meta-Llama-3.1-70B-Instruct",
11
- token=os.environ["HF_TOKEN"],
12
- )
13
 
14
- # Function to generate iframe for dataset viewer
15
- def get_iframe(hub_repo_id, sql_query=None):
16
- if not hub_repo_id:
17
- raise ValueError("Hub repo id is required")
18
- if sql_query:
19
- sql_query = urllib.parse.quote(sql_query)
20
- url = f"https://huggingface.co/datasets/{hub_repo_id}/embed/viewer?sql_console=true&sql={sql_query}"
21
- else:
22
- url = f"https://huggingface.co/datasets/{hub_repo_id}/embed/viewer"
23
- iframe = f"""
24
- <iframe src="{url}" frameborder="0" width="100%" height="800px"></iframe>
25
- """
26
- return iframe
27
 
28
- # Function to fetch dataset column information
29
- def get_column_info(hub_repo_id):
30
- url = f"https://datasets-server.huggingface.co/info?dataset={hub_repo_id}"
31
- response = requests.get(url)
32
- try:
33
- data = response.json()
34
- dataset_info = data.get("dataset_info")
35
- key = list(dataset_info.keys())[0]
36
- features = json.dumps(dataset_info.get(key).get("features"), indent=2)
37
- except Exception as e:
38
- return f"Error getting column info: {e}"
39
- return features
40
 
41
- # Function to generate SQL query based on natural language input
42
- def query_dataset(hub_repo_id, features, query):
43
  messages = [
44
  {
45
  "role": "system",
46
- "content": "You are a SQL query expert assistant that returns a DuckDB SQL query based on the user's natural language query and dataset features.",
47
  },
48
  {
49
  "role": "user",
50
- "content": f"""table train
51
  # Features
52
  {features}
53
  # Query
@@ -55,36 +31,53 @@ def query_dataset(hub_repo_id, features, query):
55
  """,
56
  },
57
  ]
58
- response = client.chat_completion(
59
- messages=messages,
60
- max_tokens=1000,
61
- stream=False,
62
- )
63
- query = response.choices[0].message.content
64
- return query, get_iframe(hub_repo_id, query)
 
 
 
 
65
 
66
  # Gradio app UI
67
  with gr.Blocks() as demo:
68
  gr.Markdown("""
69
- # πŸ₯ πŸ¦™ πŸ€— Text To SQL Hub Datasets πŸ€— πŸ¦™ πŸ₯
70
- Use this tool to search and query datasets on Huggingface Hub.
71
- Built with DuckDB, Huggingface's Inference API, and LLaMA 3.1 70B.
72
  """)
 
 
 
 
 
 
 
 
 
 
 
 
73
  with gr.Row():
74
- with gr.Column():
75
- search_in = gr.Textbox(label="Search Huggingface Hub", placeholder="Search for datasets")
76
- query = gr.Textbox(label="Natural Language Query", placeholder="Enter a query to generate SQL")
77
- sql_out = gr.Code(label="SQL Query", language="sql")
78
- with gr.Row():
79
- btn = gr.Button("Show Dataset")
80
- btn2 = gr.Button("Query Dataset")
81
- with gr.Row():
82
- search_out = gr.HTML(label="Search Results")
83
- features = gr.Code(label="Features", language="json")
84
 
85
- # Event handling
86
- btn.click(fn=get_iframe, inputs=[search_in], outputs=[search_out])
87
- btn2.click(fn=query_dataset, inputs=[search_in, features, query], outputs=[sql_out, search_out])
 
 
 
 
 
 
 
 
 
 
88
 
89
  # Launch the app
90
  demo.launch()
 
 
1
  import os
2
+ import json
3
  import gradio as gr
4
+ import duckdb
 
 
 
 
 
 
 
5
 
# Load the Parquet dataset.
# Default matches the original Colab path; override with the DATASET_PATH
# environment variable so the app also runs outside Colab.
dataset_path = os.environ.get("DATASET_PATH", "/content/sample_contract_df.parquet")
 
 
 
 
 
 
 
 
 
 
 
8
 
9
# Load the dataset with DuckDB and get schema information
def load_dataset():
    """Return the dataset schema as a list of column-description dicts.

    Opens a short-lived DuckDB connection, creates a view over the Parquet
    file at ``dataset_path``, and runs ``DESCRIBE`` on it.

    Returns:
        list[dict]: one dict per column, as produced by
        ``DataFrame.to_dict(orient="records")`` on DuckDB's DESCRIBE output.
    """
    con = duckdb.connect()
    try:
        con.execute(f"CREATE VIEW contract_data AS SELECT * FROM '{dataset_path}'")
        schema = con.execute("DESCRIBE contract_data").fetchdf()
    finally:
        # Always release the connection, even if the Parquet file is missing
        # or unreadable (the original leaked it on error).
        con.close()
    return schema.to_dict(orient="records")
 
 
 
 
 
16
 
17
# Generate SQL based on schema and user query
def generate_sql_query(features, query):
    """Build a DuckDB SELECT statement over the ``contract_data`` view.

    Args:
        features: Schema description of the dataset. Currently unused by the
            SQL construction (the previous LLM-prompt ``messages`` list that
            consumed it was dead code and has been removed); the parameter is
            kept for interface compatibility with the UI wiring.
        query (str): A SQL WHERE-clause condition, e.g. ``"amount > 1000"``.

    Returns:
        str: ``SELECT * FROM contract_data WHERE <query>``, or an unfiltered
        ``SELECT * FROM contract_data`` when *query* is empty (an empty WHERE
        clause would be invalid SQL).
    """
    # NOTE(security): *query* is interpolated directly into SQL, so the UI
    # effectively permits arbitrary SQL against the local dataset. Acceptable
    # for a local exploration tool; do not expose this to untrusted users.
    condition = (query or "").strip()
    if not condition:
        return "SELECT * FROM contract_data"
    return f"SELECT * FROM contract_data WHERE {condition}"
37
+
38
# Execute the SQL query and display results
def execute_query(sql_query):
    """Run *sql_query* against the Parquet-backed ``contract_data`` view.

    Args:
        sql_query (str): the SQL statement to execute (typically produced by
            ``generate_sql_query``).

    Returns:
        str: the result set rendered as a Markdown table.

    Raises:
        duckdb.Error: propagated to the caller (Gradio surfaces it in the UI).
    """
    con = duckdb.connect()
    try:
        con.execute(f"CREATE VIEW contract_data AS SELECT * FROM '{dataset_path}'")
        result_df = con.execute(sql_query).fetchdf()
    finally:
        # User-supplied SQL fails often; the original leaked the connection
        # whenever execute() raised. Always close it.
        con.close()
    return result_df.to_markdown()  # Convert result to markdown for display
45
 
46
  # Gradio app UI
47
  with gr.Blocks() as demo:
48
  gr.Markdown("""
49
+ # Local Parquet SQL Query App
50
+ Query and explore the data stored in `sample_contract_df.parquet` using DuckDB and SQL queries.
 
51
  """)
52
+
53
+ # Display schema
54
+ schema = load_dataset()
55
+ features = json.dumps(schema, indent=2)
56
+ gr.Markdown(f"### Dataset Schema:\n\n```json\n{features}\n```")
57
+
58
+ # User inputs for natural language query
59
+ query = gr.Textbox(label="Natural Language Query", placeholder="Enter a condition, e.g., 'amount > 1000'")
60
+ sql_out = gr.Code(label="Generated SQL Query", language="sql")
61
+ results_out = gr.Markdown(label="Query Results")
62
+
63
+ # Buttons to generate and execute SQL
64
  with gr.Row():
65
+ btn_generate = gr.Button("Generate SQL")
66
+ btn_execute = gr.Button("Execute Query")
 
 
 
 
 
 
 
 
67
 
68
+ # Generate SQL on button click
69
+ btn_generate.click(
70
+ fn=generate_sql_query,
71
+ inputs=[features, query],
72
+ outputs=sql_out,
73
+ )
74
+
75
+ # Execute SQL on button click
76
+ btn_execute.click(
77
+ fn=execute_query,
78
+ inputs=sql_out,
79
+ outputs=results_out,
80
+ )
81
 
82
  # Launch the app
83
  demo.launch()