# Install necessary libraries
!pip install gradio requests duckdb huggingface_hub

import json
import os
import urllib.parse
import gradio as gr
import requests
from huggingface_hub import InferenceClient
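
# A minimal sketch for Colab (assumes you stored an "HF_TOKEN" secret via the
# key icon in the sidebar); outside Colab, export HF_TOKEN in your shell instead:
# from google.colab import userdata
# os.environ["HF_TOKEN"] = userdata.get("HF_TOKEN")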

# InferenceClient setup (requires your HF token in the HF_TOKEN environment variable)
client = InferenceClient(
    "meta-llama/Meta-Llama-3.1-70B-Instruct",
    token=os.environ["HF_TOKEN"],
)

# Function to generate an iframe embedding the dataset viewer
def get_iframe(hub_repo_id, sql_query=None):
    if not hub_repo_id:
        raise ValueError("Hub repo id is required")
    if sql_query:
        # URL-encode the query and open the viewer with the SQL console pre-filled
        sql_query = urllib.parse.quote(sql_query)
        url = f"https://huggingface.co/datasets/{hub_repo_id}/embed/viewer?sql_console=true&sql={sql_query}"
    else:
        url = f"https://huggingface.co/datasets/{hub_repo_id}/embed/viewer"
    iframe = f"""
    <iframe src="{url}" frameborder="0" width="100%" height="800px"></iframe>
    """
    return iframe
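
# Example (illustrative dataset id; any public Hub dataset works):
#   get_iframe("stanfordnlp/imdb", "SELECT text FROM train LIMIT 10")
# returns an <iframe> whose viewer opens with that query in the SQL console.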

# Function to fetch dataset column information from the datasets-server API
def get_column_info(hub_repo_id):
    url = f"https://datasets-server.huggingface.co/info?dataset={hub_repo_id}"
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        data = response.json()
        dataset_info = data.get("dataset_info")
        # Info is keyed by config name; take the features of the first config
        key = list(dataset_info.keys())[0]
        features = json.dumps(dataset_info.get(key).get("features"), indent=2)
    except Exception as e:
        return f"Error getting column info: {e}"
    return features
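
# The /info endpoint returns JSON shaped roughly like this (illustrative):
#   {"dataset_info": {"default": {"features": {"text": {"dtype": "string"}, ...}}}}
# so the function returns the pretty-printed "features" mapping of the first config.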

# Function to generate a SQL query from a natural-language request
def query_dataset(hub_repo_id, features, query):
    messages = [
        {
            "role": "system",
            "content": (
                "You are a SQL query expert assistant that returns a DuckDB SQL query "
                "based on the user's natural language query and dataset features. "
                "Respond with only the SQL query, no explanation or markdown."
            ),
        },
        {
            "role": "user",
            "content": f"""table train
# Features
{features}
# Query
{query}
""",
        },
    ]
    response = client.chat_completion(
        messages=messages,
        max_tokens=1000,
        stream=False,
    )
    sql_query = response.choices[0].message.content.strip()
    # Models often wrap SQL in markdown fences despite instructions; strip them
    sql_query = sql_query.removeprefix("```sql").removeprefix("```").removesuffix("```").strip()
    return sql_query, get_iframe(hub_repo_id, sql_query)
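
# Illustrative call (values hypothetical):
#   sql, html = query_dataset("stanfordnlp/imdb", features_json, "count rows per label")
#   # sql might come back as: SELECT label, COUNT(*) AS n FROM train GROUP BY label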

# Gradio app UI
with gr.Blocks() as demo:
    gr.Markdown("""
    # 🥐 🦙 🤗 Text To SQL Hub Datasets 🤗 🦙 🥐
    Use this tool to search and query datasets on the Hugging Face Hub.
    Built with DuckDB, Hugging Face's Inference API, and Llama 3.1 70B.
    """)
    with gr.Row():
        with gr.Column():
            search_in = gr.Textbox(label="Hub Dataset ID", placeholder="Dataset repo id, e.g. user/dataset_name")
            query = gr.Textbox(label="Natural Language Query", placeholder="Enter a query to generate SQL")
            sql_out = gr.Code(label="SQL Query", language="sql")
    with gr.Row():
        btn = gr.Button("Show Dataset")
        btn2 = gr.Button("Query Dataset")
    with gr.Row():
        search_out = gr.HTML(label="Search Results")
        features = gr.Code(label="Features", language="json")

    # Event handling: showing a dataset also fetches its features,
    # which the query button then passes to the model
    btn.click(fn=get_iframe, inputs=[search_in], outputs=[search_out])
    btn.click(fn=get_column_info, inputs=[search_in], outputs=[features])
    btn2.click(fn=query_dataset, inputs=[search_in, features, query], outputs=[sql_out, search_out])

# Launch the app (pass share=True in Colab for a public link)
demo.launch()