# NOTE: the following banner text was scraped from the hosting page and is not code:
# Spaces: Sleeping
import json
import logging
import re

import gradio as gr
import nbformat as nbf
import pandas as pd
from gradio_huggingfacehub_search import HuggingfaceHubSearch
from httpx import Client
from huggingface_hub import HfApi, InferenceClient
""" | |
TODOs: | |
- Refactor | |
- Make the notebook generation more dynamic, add loading components to do not freeze the UI | |
- Fix errors: | |
- When generating output | |
- When parsing output | |
- When pushing notebook | |
- Parametrize the commands (Move to another file) | |
- Use an LLM to suggest commands by column types | |
- Add target tasks to choose for the notebook: | |
- Exploratory data analysis | |
- Auto training | |
- RAG | |
- etc. | |
- Enable 'generate notebook' button only if dataset is available and supports library | |
- First get compatible-libraries and let user choose the library | |
""" | |
# Configuration
# Base URL of the Hugging Face Datasets Server (compatible-libraries / first-rows endpoints).
BASE_DATASETS_SERVER_URL = "https://datasets-server.huggingface.co"
HEADERS = {"Accept": "application/json", "Content-Type": "application/json"}
# Shared HTTP client used by all Datasets Server calls below.
client = Client(headers=HEADERS)
# LLM used to draft the notebook cells from the EDA prompt.
inference_client = InferenceClient("meta-llama/Meta-Llama-3-8B-Instruct")
logging.basicConfig(level=logging.INFO)
def get_compatible_libraries(dataset: str):
    """Query the Datasets Server for the libraries compatible with *dataset*.

    Raises an HTTP error if the Datasets Server responds with a failure status.
    """
    url = f"{BASE_DATASETS_SERVER_URL}/compatible-libraries?dataset={dataset}"
    response = client.get(url)
    response.raise_for_status()
    return response.json()
import pandas as pd | |
def generate_eda_prompt(columns_info, df, first_code):
    """Build the LLM prompt that requests an EDA notebook as fenced-JSON cells.

    Args:
        columns_info: mapping of column name -> feature type (as returned by
            the Datasets Server /first-rows endpoint).
        df: pandas DataFrame of sample rows; only the first 5 rows are embedded.
        first_code: loading-code snippet the generated notebook must use to
            load the dataset.

    Returns:
        The fully formatted prompt string.
    """
    # Embed at most 5 records so the prompt stays small.
    sample_data = df.head(5).to_dict(orient='records')
    # Schema instructions for the model's output; the braces inside are safe
    # because this text is substituted as a value, not used as a template.
    format_instructions = """
The output should be a markdown code snippet formatted in the
following schema, including the leading and trailing "```json" and "```":
```json
[
{
"cell_type": string // This refers either is a markdown or code cell type.
"source": list of string // This is the list of text or python code.
}
]
```
"""
    prompt = """
You are an expert data analyst tasked with generating an exploratory data analysis (EDA) Jupyter notebook. The data is provided as a pandas DataFrame with the following structure:
Columns and Data Types:
{columns_info}
Sample Data:
{sample_data}
Please create a pandas EDA notebook that includes the following:
1. Summary statistics for numerical columns.
2. Distribution plots for numerical columns.
3. Bar plots or count plots for categorical columns.
4. Correlation matrix and heatmap for numerical columns.
5. Any additional relevant visualizations or analyses you deem appropriate.
Ensure the notebook is well-organized, with explanations for each step.
It is mandatory that you use the following code to load the dataset, DO NOT try to load the dataset in any other way:
{first_code}
{format_instructions}
"""
    return prompt.format(
        columns_info=columns_info,
        sample_data=sample_data,
        first_code=first_code,
        format_instructions=format_instructions,
    )
def create_notebook_file(cell_commands, notebook_name):
    """Write *cell_commands* to disk as a Jupyter notebook.

    Each command is a dict with 'cell_type' ('code' or anything else, treated
    as markdown) and 'source' (the cell text).
    """
    notebook = nbf.v4.new_notebook()
    cells = []
    for command in cell_commands:
        if command['cell_type'] == 'code':
            cells.append(nbf.v4.new_code_cell(command['source']))
        else:
            cells.append(nbf.v4.new_markdown_cell(command['source']))
    notebook["cells"] = cells
    with open(notebook_name, "w") as f:
        nbf.write(notebook, f)
    logging.info(f"Notebook {notebook_name} created successfully")
def push_notebook(file_path, dataset_id, token):
    """Upload the generated notebook to the dataset repo on the Hub.

    Returns an HTML component with a link to the notebook on success, or a
    failure message on error.
    """
    notebook_name = "dataset_analysis.ipynb"
    api = HfApi(token=token)
    try:
        api.upload_file(
            path_or_fileobj=file_path,
            path_in_repo=notebook_name,
            repo_id=dataset_id,
            repo_type="dataset",
        )
        link = f"https://huggingface.co/datasets/{dataset_id}/blob/main/{notebook_name}"
        html = f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline; text-decoration-style: dotted;">See notebook</a>'
        return gr.HTML(value=html, visible=True)
    except Exception as err:
        logging.error(f"Failed to push notebook: {err}")
        return gr.HTML(value="Failed to push notebook", visible=True)
def get_first_rows_as_df(dataset: str, config: str, split: str, limit: int):
    """Fetch the dataset's first rows and return (feature types, sampled DataFrame).

    The rows are shuffled (sample(frac=1)) before truncating to *limit* rows.
    """
    response = client.get(
        f"{BASE_DATASETS_SERVER_URL}/first-rows?dataset={dataset}&config={config}&split={split}"
    )
    response.raise_for_status()
    payload = response.json()
    records = [entry['row'] for entry in payload["rows"]]
    first_rows_df = pd.DataFrame.from_dict(records).sample(frac=1).head(limit)
    features_dict = {feature['name']: feature['type'] for feature in payload['features']}
    return features_dict, first_rows_df
def content_from_output(output):
    """Extract the text inside a fenced block from an LLM *output*.

    Tries the ```json fence first; falls back to any plain ``` fence.

    Returns:
        The raw text captured inside the fence.

    Raises:
        Exception: if no fenced block is found in *output*.
    """
    pattern = r'`json(.*?)`'
    logging.info("--------> Getting data from output")
    match = re.search(pattern, output, re.DOTALL)
    if not match:
        pattern = r'```(.*?)```'
        logging.info("--------> Getting data from output, second try")
        match = re.search(pattern, output, re.DOTALL)
        if not match:
            raise Exception("Unable to generate jupyter notebook.")
    extracted_text = match.group(1)
    logging.info(extracted_text)
    # Bug fix: the original function fell off the end and implicitly returned None.
    return extracted_text
def get_notebook_cells(prompt):
    """Ask the inference model for notebook cells and parse its fenced JSON reply.

    Returns:
        A list of {'cell_type': ..., 'source': ...} dicts.

    Raises:
        Exception: if the model reply contains no fenced block.
        json.JSONDecodeError: if the fenced text is not valid JSON.
    """
    messages = [{"role": "user", "content": prompt}]
    output = inference_client.chat_completion(messages=messages, max_tokens=2500)
    output = output.choices[0].message.content
    logging.info(output)
    pattern = r'`json(.*?)`'
    logging.info("--------> Getting data from output")
    match = re.search(pattern, output, re.DOTALL)
    if not match:
        # Fallback: accept any plain ``` fence, mirroring content_from_output.
        pattern = r'```(.*?)```'
        match = re.search(pattern, output, re.DOTALL)
    if not match:
        raise Exception("Unable to generate jupyter notebook.")
    extracted_text = match.group(1)
    logging.info(extracted_text)
    content = json.loads(extracted_text)
    logging.info(content)
    return content
def generate_notebook(dataset_id):
    """Generate an EDA notebook for *dataset_id*.

    Returns:
        A (gr.File, gr.Row update) pair: the downloadable notebook and the
        visibility of the push-to-hub row. Both are hidden on failure.
    """
    try:
        libraries = get_compatible_libraries(dataset_id)
    except Exception as err:
        gr.Error('Unable to retrieve dataset info from HF Hub.')
        logging.error(f"Failed to fetch compatible libraries: {err}")
        # Bug fix: this callback drives two outputs (file + row), so every
        # path must return two updates; the original returned a bare None here.
        return gr.File(visible=False), gr.Row.update(visible=False)
    # Find the pandas loading recipe; an empty/missing library list is handled
    # the same way as "no pandas support" (the original duplicated this branch).
    pandas_library = next(
        (lib for lib in libraries.get("libraries", []) if lib["library"] == "pandas"),
        None,
    ) if libraries else None
    if not pandas_library:
        gr.Warning('Dataset not compatible with pandas library.')
        logging.error("Dataset not compatible with pandas library")
        return gr.File(visible=False), gr.Row.update(visible=False)
    # Use the first config/split as representative input for the prompt.
    first_config_loading_code = pandas_library['loading_codes'][0]
    first_code = first_config_loading_code['code']
    first_config = first_config_loading_code['config_name']
    first_split = list(first_config_loading_code['arguments']['splits'].keys())[0]
    logging.info(f"First config: {first_config} - first split: {first_split}")
    features, df = get_first_rows_as_df(dataset_id, first_config, first_split, 3)
    prompt = generate_eda_prompt(features, df, first_code)
    logging.info(f"Prompt: {prompt}")
    commands = get_notebook_cells(prompt)
    notebook_name = f"{dataset_id.replace('/', '-')}.ipynb"
    create_notebook_file(commands, notebook_name=notebook_name)
    return gr.File(value=notebook_name, visible=True), gr.Row.update(visible=True)
# Gradio UI: dataset picker -> generate notebook -> optional push to Hub.
with gr.Blocks() as demo:
    gr.Markdown("# 🤖 Dataset notebook creator 🕵️")
    # Searchable dataset-id input backed by the Hugging Face Hub.
    dataset_name = HuggingfaceHubSearch(
        label="Hub Dataset ID",
        placeholder="Search for dataset id on Huggingface",
        search_type="dataset",
        value="",
    )

    def embed(name):
        # Renders the dataset-viewer iframe for *name*.
        # NOTE(review): this helper is never wired to any event below — confirm
        # whether a dataset_name.change(embed, ...) hookup is missing.
        if not name:
            return gr.Markdown("### No dataset provided")
        html_code = f"""
<iframe
src="https://huggingface.co/datasets/{name}/embed/viewer/default/train"
frameborder="0"
width="100%"
height="350px"
></iframe>
"""
        return gr.HTML(value=html_code)

    generate_btn = gr.Button("Generate notebook")
    # Hidden until generate_notebook succeeds.
    download_link = gr.File(label="Download notebook", visible=False)
    # Token-entry row; revealed by generate_notebook's second return value.
    with gr.Row(visible=False) as auth_page:
        with gr.Column():
            gr.Markdown(
                "Want to push to hub? Enter your token ([settings](https://huggingface.co/settings/tokens)):"
            )
            token_box = gr.Textbox(
                "", label="token", placeholder="hf_xxx", type="password"
            )
            auth_error = gr.Markdown("", visible=False)
    push_btn = gr.Button("Push notebook to hub", visible=False)
    output_lbl = gr.HTML(value="", visible=False)
    generate_btn.click(
        generate_notebook,
        inputs=[dataset_name],
        outputs=[download_link, auth_page],
    )

    def auth(token):
        # Reveals the push button once any non-empty token has been entered.
        # NOTE(review): auth_error stays hidden in both branches — confirm
        # whether an error message was meant to appear for an empty token.
        if not token:
            return {
                auth_error: gr.Markdown(value="", visible=False),
                push_btn: gr.Button(visible=False),
            }
        return {
            auth_error: gr.Markdown(value="", visible=False),
            push_btn: gr.Button("Push notebook to hub", visible=True),
        }

    token_box.change(
        auth,
        inputs=token_box,
        outputs=[auth_error, push_btn],
    )
    push_btn.click(
        push_notebook,
        inputs=[download_link, dataset_name, token_box],
        outputs=output_lbl,
    )
demo.launch()