|
import json |
|
import os |
|
import urllib.parse |
|
|
|
import gradio as gr |
|
import requests |
|
from gradio_huggingfacehub_search import HuggingfaceHubSearch |
|
from huggingface_hub import InferenceClient |
|
|
|
# Sample value produced by the Hub search component.
example = HuggingfaceHubSearch().example_value()

# Shared inference client used by `query_dataset`. The access token is read
# from the HF_TOKEN environment variable; a missing variable raises KeyError
# at import time.
client = InferenceClient(
    model="meta-llama/Meta-Llama-3.1-70B-Instruct",
    token=os.environ["HF_TOKEN"],
)
|
|
|
|
|
def get_iframe(hub_repo_id, sql_query=None):
    """Build the HTML ``<iframe>`` snippet that embeds the Hub dataset viewer.

    Args:
        hub_repo_id: Dataset repository id on the Hugging Face Hub
            (``"owner/name"``). It comes straight from the search box, so it
            is treated as untrusted text.
        sql_query: Optional SQL text. When truthy, the viewer URL enables the
            SQL console and carries this query in its ``sql`` parameter.

    Returns:
        An HTML string containing a single ``<iframe>`` element, or an empty
        string when ``hub_repo_id`` is empty.
    """
    import html  # function-scope import keeps this block self-contained

    # Nothing selected yet: render nothing instead of a malformed viewer URL.
    if not hub_repo_id:
        return ""

    # Percent-encode the repo id but keep "/" so "owner/name" stays a path.
    # Valid repo ids are unaffected by this encoding.
    repo_path = urllib.parse.quote(str(hub_repo_id), safe="/")
    url = f"https://huggingface.co/datasets/{repo_path}/embed/viewer"
    if sql_query:
        url += "?sql_console=true&sql=" + urllib.parse.quote(sql_query)

    # Escape for the attribute context so the value can never close the
    # src="..." attribute; the browser decodes "&amp;" back to "&".
    src = html.escape(url, quote=True)
    return f"""
<iframe
  src="{src}"
  frameborder="0"
  width="100%"
  height="800px"
></iframe>
"""
|
|
|
|
|
def get_column_info(hub_repo_id):
    """Fetch the feature (column) schema of a Hub dataset as a JSON string.

    Calls the datasets-server ``/info`` endpoint and serialises the
    ``features`` entry of the first config listed under ``dataset_info``.

    Args:
        hub_repo_id: Dataset repository id, sent as the ``dataset`` query
            parameter.

    Returns:
        The ``features`` mapping of the first config, dumped with
        ``json.dumps``.

    Raises:
        gr.Error: If the request fails, the response is not the expected
            JSON shape, or no dataset info is available. Raising (rather
            than merely constructing) the error is what makes Gradio show it.
    """
    try:
        # `params` URL-encodes the repo id; the timeout keeps a stalled
        # server from hanging the UI callback indefinitely.
        response = requests.get(
            "https://datasets-server.huggingface.co/info",
            params={"dataset": hub_repo_id},
            timeout=30,
        )
        response.raise_for_status()
        dataset_info = response.json().get("dataset_info")
        if not dataset_info:
            raise ValueError("no dataset info available")
        # Use the first config listed in the response.
        first_config = next(iter(dataset_info.values()))
        return json.dumps(first_config.get("features"))
    except Exception as e:
        # UI boundary: surface any failure to the user as a Gradio error.
        raise gr.Error(f"Error getting column info: {e}") from e
|
|
|
|
|
def query_dataset(hub_repo_id, features, query):
    """Turn a natural-language request into SQL and embed it in the viewer.

    Sends the dataset features and the user's request to the module-level
    chat ``client`` and uses the model's reply, unmodified, as the SQL query.

    Args:
        hub_repo_id: Dataset repository id passed on to ``get_iframe``.
        features: Text describing the dataset features, inserted in the prompt.
        query: The user's natural-language request.

    Returns:
        A ``(sql, iframe_html)`` tuple: the model's reply and the viewer
        iframe built from that reply.
    """
    system_message = {
        "role": "system",
        "content": "You are a helpful assistant that returns a DuckDB SQL query based on the user's query and dataset features. Only return the SQL query, no other text.",
    }
    # The prompt names the table "train" and then lists features and request.
    user_message = {
        "role": "user",
        "content": f"""table train
# Features
{features}

# Query
{query}
""",
    }

    completion = client.chat_completion(
        messages=[system_message, user_message],
        max_tokens=1000,
        stream=False,
    )
    generated_sql = completion.choices[0].message.content
    viewer_html = get_iframe(hub_repo_id, generated_sql)
    return generated_sql, viewer_html
|
|
|
|
|
# UI layout and event wiring for the text-to-SQL demo.
with gr.Blocks() as demo:
    # Header text is assembled from explicit pieces so the rendered Markdown
    # does not depend on source indentation.
    gr.Markdown(
        "# 🔥 🦆 🤗 Text To SQL Hub Datasets 🤗 🦆 🔥\n"
        "\n"
        "This is a basic text to SQL tool that allows you to query datasets on Huggingface Hub.\n"
        "It is built with [DuckDB](https://duckdb.org/), [Huggingface's Inference API](https://huggingface.co/docs/api-inference/index), and [LLama 3.1 70B](https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct).\n"
        "Also, it uses the [dataset-server API](https://redocly.github.io/redoc/?url=https://datasets-server.huggingface.co/openapi.json#operation/isValidDataset).\n"
    )
    with gr.Row():
        with gr.Column():
            # The component searches datasets, so the placeholder says so.
            search_in = HuggingfaceHubSearch(
                label="Search Huggingface Hub",
                placeholder="Search for datasets on Huggingface",
                search_type="dataset",
            )

            # NOTE(review): nesting was reconstructed from a source whose
            # indentation was lost - confirm the button belongs in this column.
            btn = gr.Button("Show Dataset")
    with gr.Row():
        search_out = gr.HTML(label="Search Results")
    with gr.Row():
        # Hidden holder for the features JSON that is fed to the model prompt.
        features = gr.Code(label="Features", language="json", visible=False)
    with gr.Row():
        query = gr.Textbox(
            label="Natural Language Query",
            placeholder="Enter a natural language query to generate SQL",
        )
    with gr.Row():
        sql_out = gr.Code(label="SQL Query")
    with gr.Row():
        btn2 = gr.Button("Query Dataset")

    # Clicking "Show Dataset" or submitting the search renders the viewer,
    # then loads the dataset's features into the hidden component.
    gr.on(
        [btn.click, search_in.submit],
        fn=get_iframe,
        inputs=[search_in],
        outputs=[search_out],
    ).then(
        fn=get_column_info,
        inputs=[search_in],
        outputs=[features],
    )

    # "Query Dataset" generates SQL and re-renders the viewer with that query.
    btn2.click(
        fn=query_dataset,
        inputs=[search_in, features, query],
        outputs=[sql_out, search_out],
    )

if __name__ == "__main__":
    demo.launch()
|
|