import json
import logging
import re

import gradio as gr
import nbformat as nbf
import pandas as pd
from gradio_huggingfacehub_search import HuggingfaceHubSearch
from httpx import Client
from huggingface_hub import HfApi, InferenceClient

""" |
|
TODOs: |
|
- Refactor |
|
- Make the notebook generation more dynamic, add loading components to do not freeze the UI |
|
- Fix errors: |
|
- When generating output |
|
- When parsing output |
|
- When pushing notebook |
|
- Parametrize the commands (Move to another file) |
|
- Use an LLM to suggest commands by column types |
|
- Add target tasks to choose for the notebook: |
|
- Exploratory data analysis |
|
- Auto training |
|
- RAG |
|
- etc. |
|
- Enable 'generate notebook' button only if dataset is available and supports library |
|
- First get compatible-libraries and let user choose the library |
|
""" |
|
|
|
|
|
BASE_DATASETS_SERVER_URL = "https://datasets-server.huggingface.co"
HEADERS = {"Accept": "application/json", "Content-Type": "application/json"}

client = Client(headers=HEADERS)
inference_client = InferenceClient("meta-llama/Meta-Llama-3-8B-Instruct")

logging.basicConfig(level=logging.INFO)

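# `client` talks to the datasets-server REST API, while `inference_client`
# calls the hosted chat model that drafts the notebook cells.
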
def get_compatible_libraries(dataset: str):
    """Ask the datasets server which loading libraries support this dataset."""
    resp = client.get(
        f"{BASE_DATASETS_SERVER_URL}/compatible-libraries?dataset={dataset}"
    )
    resp.raise_for_status()
    return resp.json()

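# For reference, the response payload is expected to look roughly like this
# (only the keys used further below are shown; the real schema may carry more):
# {
#     "libraries": [
#         {
#             "library": "pandas",
#             "loading_codes": [
#                 {
#                     "config_name": "default",
#                     "arguments": {"splits": {"train": "path/to/file.parquet"}},
#                     "code": "df = pd.read_parquet(...)",
#                 }
#             ],
#         }
#     ]
# }
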
def generate_eda_prompt(columns_info, df, first_code):
    """Build the prompt asking the LLM for an EDA notebook, from the column types, a data sample, and the loading code."""
    sample_data = df.head(5).to_dict(orient="records")

    format_instructions = """
The output should be a markdown code snippet formatted in the
following schema, including the leading and trailing "```json" and "```":

```json
[
    {
        "cell_type": string  // Either "markdown" or "code".
        "source": list of string  // The lines of markdown text or python code.
    }
]
```
"""

    prompt = """
You are an expert data analyst tasked with generating an exploratory data analysis (EDA) Jupyter notebook. The data is provided as a pandas DataFrame with the following structure:

Columns and Data Types:
{columns_info}

Sample Data:
{sample_data}

Please create a pandas EDA notebook that includes the following:

1. Summary statistics for numerical columns.
2. Distribution plots for numerical columns.
3. Bar plots or count plots for categorical columns.
4. Correlation matrix and heatmap for numerical columns.
5. Any additional relevant visualizations or analyses you deem appropriate.

Ensure the notebook is well-organized, with explanations for each step.

It is mandatory that you use the following code to load the dataset, DO NOT try to load the dataset in any other way:

{first_code}

{format_instructions}
"""
    return prompt.format(
        columns_info=columns_info,
        sample_data=sample_data,
        first_code=first_code,
        format_instructions=format_instructions,
    )

def create_notebook_file(cell_commands, notebook_name):
    """Write the generated cells to a Jupyter notebook file on disk."""
    nb = nbf.v4.new_notebook()
    nb["cells"] = [
        nbf.v4.new_code_cell(command["source"])
        if command["cell_type"] == "code"
        else nbf.v4.new_markdown_cell(command["source"])
        for command in cell_commands
    ]
    with open(notebook_name, "w") as f:
        nbf.write(nb, f)
    logging.info(f"Notebook {notebook_name} created successfully")

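# Illustrative usage with hypothetical cells:
# create_notebook_file(
#     [
#         {"cell_type": "markdown", "source": ["# Exploratory Data Analysis"]},
#         {"cell_type": "code", "source": ["df.describe()"]},
#     ],
#     "example.ipynb",
# )
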
def push_notebook(file_path, dataset_id, token):
    """Upload the generated notebook to the dataset repository on the Hub and return a link component."""
    notebook_name = "dataset_analysis.ipynb"
    api = HfApi(token=token)
    try:
        api.upload_file(
            path_or_fileobj=file_path,
            path_in_repo=notebook_name,
            repo_id=dataset_id,
            repo_type="dataset",
        )
        link = f"https://huggingface.co/datasets/{dataset_id}/blob/main/{notebook_name}"
        return gr.HTML(
            value=f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline; text-decoration-style: dotted;">See notebook</a>',
            visible=True,
        )
    except Exception as err:
        logging.error(f"Failed to push notebook: {err}")
        return gr.HTML(value="Failed to push notebook", visible=True)

def get_first_rows_as_df(dataset: str, config: str, split: str, limit: int):
    """Fetch the first rows of a split and return the column types plus a shuffled sample as a DataFrame."""
    resp = client.get(
        f"{BASE_DATASETS_SERVER_URL}/first-rows?dataset={dataset}&config={config}&split={split}"
    )
    resp.raise_for_status()
    content = resp.json()
    rows = [row["row"] for row in content["rows"]]
    first_rows_df = pd.DataFrame.from_dict(rows).sample(frac=1).head(limit)
    features = content["features"]
    features_dict = {feature["name"]: feature["type"] for feature in features}
    return features_dict, first_rows_df

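# The /first-rows response is expected to look roughly like this (only the
# keys used above are shown):
# {
#     "features": [{"name": "text", "type": {"dtype": "string", "_type": "Value"}}],
#     "rows": [{"row": {"text": "..."}}],
# }
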
def content_from_output(output):
    """Extract the JSON payload from the model output, trying the ```json fence first."""
    pattern = r"```json(.*?)```"
    logging.info("--------> Getting data from output")
    match = re.search(pattern, output, re.DOTALL)
    if not match:
        pattern = r"```(.*?)```"
        logging.info("--------> Getting data from output, second try")
        match = re.search(pattern, output, re.DOTALL)
    if not match:
        raise Exception("Unable to generate jupyter notebook.")
    extracted_text = match.group(1)
    logging.info(extracted_text)
    return extracted_text

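# Illustrative example of an extracted payload (hypothetical model answer):
# [
#     {"cell_type": "markdown", "source": ["## Summary statistics"]},
#     {"cell_type": "code", "source": ["df.describe()"]},
# ]
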
def get_notebook_cells(prompt):
    """Ask the LLM for notebook cells and parse them from its JSON answer."""
    messages = [{"role": "user", "content": prompt}]
    output = inference_client.chat_completion(messages=messages, max_tokens=2500)
    output = output.choices[0].message.content
    logging.info(output)
    extracted_text = content_from_output(output)
    content = json.loads(extracted_text)
    logging.info(content)
    return content

def generate_notebook(dataset_id):
    try:
        libraries = get_compatible_libraries(dataset_id)
    except Exception as err:
        logging.error(f"Failed to fetch compatible libraries: {err}")
        raise gr.Error("Unable to retrieve dataset info from HF Hub.")

    if not libraries:
        gr.Warning("Dataset not compatible with pandas library.")
        logging.error("Dataset not compatible with pandas library")
        return gr.File(visible=False), gr.Row(visible=False)

    pandas_library = next(
        (lib for lib in libraries.get("libraries", []) if lib["library"] == "pandas"),
        None,
    )
    if not pandas_library:
        gr.Warning("Dataset not compatible with pandas library.")
        logging.error("Dataset not compatible with pandas library")
        return gr.File(visible=False), gr.Row(visible=False)

    first_config_loading_code = pandas_library["loading_codes"][0]
    first_code = first_config_loading_code["code"]
    first_config = first_config_loading_code["config_name"]
    first_split = list(first_config_loading_code["arguments"]["splits"].keys())[0]
    logging.info(f"First config: {first_config} - first split: {first_split}")
    first_file = f"hf://datasets/{dataset_id}/{first_config_loading_code['arguments']['splits'][first_split]}"
    logging.info(f"First split file: {first_file}")

    features, df = get_first_rows_as_df(dataset_id, first_config, first_split, 3)
    prompt = generate_eda_prompt(features, df, first_code)
    logging.info(f"Prompt: {prompt}")
    commands = get_notebook_cells(prompt)

    notebook_name = f"{dataset_id.replace('/', '-')}.ipynb"
    create_notebook_file(commands, notebook_name=notebook_name)
    return gr.File(value=notebook_name, visible=True), gr.Row(visible=True)


with gr.Blocks() as demo:
    gr.Markdown("# 🤖 Dataset notebook creator 🕵️")
    dataset_name = HuggingfaceHubSearch(
        label="Hub Dataset ID",
        placeholder="Search for dataset id on Huggingface",
        search_type="dataset",
        value="",
    )

    @gr.render(inputs=dataset_name)
    def embed(name):
        if not name:
            return gr.Markdown("### No dataset provided")
        html_code = f"""
        <iframe
            src="https://huggingface.co/datasets/{name}/embed/viewer/default/train"
            frameborder="0"
            width="100%"
            height="350px"
        ></iframe>
        """
        return gr.HTML(value=html_code)

    generate_btn = gr.Button("Generate notebook")
    download_link = gr.File(label="Download notebook", visible=False)
    with gr.Row(visible=False) as auth_page:
        with gr.Column():
            gr.Markdown(
                "Want to push to hub? Enter your token ([settings](https://huggingface.co/settings/tokens)):"
            )
            token_box = gr.Textbox(
                "", label="token", placeholder="hf_xxx", type="password"
            )
            auth_error = gr.Markdown("", visible=False)

    push_btn = gr.Button("Push notebook to hub", visible=False)
    output_lbl = gr.HTML(value="", visible=False)

    generate_btn.click(
        generate_notebook,
        inputs=[dataset_name],
        outputs=[download_link, auth_page],
    )

    def auth(token):
        """Show the push button only once a token has been entered."""
        if not token:
            return {
                auth_error: gr.Markdown(value="", visible=False),
                push_btn: gr.Button(visible=False),
            }
        return {
            auth_error: gr.Markdown(value="", visible=False),
            push_btn: gr.Button("Push notebook to hub", visible=True),
        }

    token_box.change(
        auth,
        inputs=token_box,
        outputs=[auth_error, push_btn],
    )

    push_btn.click(
        push_notebook,
        inputs=[download_link, dataset_name, token_box],
        outputs=output_lbl,
    )

demo.launch()