"""Gradio app that generates an exploratory data analysis notebook for a Hub dataset.

TODOs:
- Refactor
- Make the notebook generation more dynamic, add loading components so the UI
  does not freeze
- Fix errors:
    - When generating output
    - When parsing output
    - When pushing notebook
- Parametrize the commands (move to another file)
- Use an LLM to suggest commands by column types
- Add target tasks to choose for the notebook:
    - Exploratory data analysis
    - Auto training
    - RAG
    - etc.
- Enable 'generate notebook' button only if dataset is available and supports
  library
- First get compatible-libraries and let user choose the library
"""

import html
import json
import logging
import re

import gradio as gr
import nbformat as nbf
import pandas as pd
from gradio_huggingfacehub_search import HuggingfaceHubSearch
from httpx import Client
from huggingface_hub import HfApi
from huggingface_hub import InferenceClient

# Configuration
BASE_DATASETS_SERVER_URL = "https://datasets-server.huggingface.co"
HEADERS = {"Accept": "application/json", "Content-Type": "application/json"}

client = Client(headers=HEADERS)
inference_client = InferenceClient("meta-llama/Meta-Llama-3-8B-Instruct")

logging.basicConfig(level=logging.INFO)

# Output-format contract appended to the prompt. It is substituted into the
# template as a *value*, so the literal braces below are not seen by
# ``str.format``.
_FORMAT_INSTRUCTIONS = """
The output should be a markdown code snippet formatted in the following schema, including the leading and trailing "```json" and "```":

```json
[
    {
        "cell_type": string  // This refers either is a markdown or code cell type.
        "source": list of string  // This is the list of text or python code.
    }
]
```
"""

_EDA_PROMPT_TEMPLATE = """
You are an expert data analyst tasked with generating an exploratory data analysis (EDA) Jupyter notebook.
The data is provided as a pandas DataFrame with the following structure:

Columns and Data Types:
{columns_info}

Sample Data:
{sample_data}

Please create a pandas EDA notebook that includes the following:

1. Summary statistics for numerical columns.
2. Distribution plots for numerical columns.
3. Bar plots or count plots for categorical columns.
4. Correlation matrix and heatmap for numerical columns.
5. Any additional relevant visualizations or analyses you deem appropriate.

Ensure the notebook is well-organized, with explanations for each step.
It is mandatory that you use the following code to load the dataset, DO NOT try to load the dataset in any other way:

{first_code}

{format_instructions}
"""


def get_compatible_libraries(dataset: str):
    """Return the datasets-server ``compatible-libraries`` payload for *dataset*.

    Args:
        dataset: Hub dataset id, e.g. ``"user/name"``.

    Returns:
        The decoded JSON response.

    Raises:
        httpx.HTTPStatusError: If the server answers with an error status.
    """
    # Let httpx build the query string so the dataset id is URL-encoded.
    resp = client.get(
        f"{BASE_DATASETS_SERVER_URL}/compatible-libraries",
        params={"dataset": dataset},
    )
    resp.raise_for_status()
    return resp.json()


def generate_eda_prompt(columns_info, df, first_code):
    """Build the LLM prompt asking for an EDA notebook.

    Args:
        columns_info: Mapping of column name to feature type.
        df: DataFrame holding sample rows; its first five rows are embedded
            in the prompt.
        first_code: Code snippet the generated notebook must use to load the
            dataset.

    Returns:
        The fully formatted prompt string.
    """
    sample_data = df.head(5).to_dict(orient="records")
    return _EDA_PROMPT_TEMPLATE.format(
        columns_info=columns_info,
        sample_data=sample_data,
        first_code=first_code,
        format_instructions=_FORMAT_INSTRUCTIONS,
    )


def _normalize_source(source):
    """Return a cell ``source`` as a single string.

    The prompt asks the model for a list of strings. nbformat writes a list
    as-is and concatenates it on read, so lines without a trailing newline
    would be glued together. Join them explicitly instead.
    """
    if isinstance(source, str):
        return source
    lines = [str(line) for line in source]
    if any(line.endswith("\n") for line in lines):
        # Lines already carry their own terminators.
        return "".join(lines)
    return "\n".join(lines)


def create_notebook_file(cell_commands, notebook_name):
    """Write a notebook made of *cell_commands* to *notebook_name*.

    Args:
        cell_commands: Iterable of dicts with a ``cell_type`` key (``"code"``
            produces a code cell, anything else a markdown cell) and a
            ``source`` key (string or list of strings).
        notebook_name: Path of the ``.ipynb`` file to create.

    Raises:
        KeyError: If a cell dict lacks ``cell_type`` or ``source``.
    """
    nb = nbf.v4.new_notebook()
    cells = []
    for command in cell_commands:
        source = _normalize_source(command["source"])
        if command["cell_type"] == "code":
            cells.append(nbf.v4.new_code_cell(source))
        else:
            cells.append(nbf.v4.new_markdown_cell(source))
    nb["cells"] = cells

    # Explicit encoding: generated cells may contain non-ASCII characters
    # and the platform default encoding is not guaranteed to be UTF-8.
    with open(notebook_name, "w", encoding="utf-8") as f:
        nbf.write(nb, f)

    logging.info("Notebook %s created successfully", notebook_name)


def push_notebook(file_path, dataset_id, token):
    """Upload the generated notebook to the dataset repository on the Hub.

    Args:
        file_path: Local path of the notebook file.
        dataset_id: Hub dataset id used as the target repository.
        token: Hugging Face access token used for the upload.

    Returns:
        A visible ``gr.HTML`` component holding either a link to the uploaded
        notebook or a failure message.
    """
    notebook_name = "dataset_analysis.ipynb"
    api = HfApi(token=token)
    try:
        api.upload_file(
            path_or_fileobj=file_path,
            path_in_repo=notebook_name,
            repo_id=dataset_id,
            repo_type="dataset",
        )
    except Exception as err:  # UI boundary: report instead of crashing.
        logging.error(f"Failed to push notebook: {err}")
        return gr.HTML(value="Failed to push notebook", visible=True)

    link = f"https://huggingface.co/datasets/{dataset_id}/blob/main/{notebook_name}"
    # The link was previously computed but never rendered.
    return gr.HTML(
        value=f'<a href="{html.escape(link, quote=True)}" target="_blank">See notebook</a>',
        visible=True,
    )


def get_first_rows_as_df(dataset: str, config: str, split: str, limit: int):
    """Fetch the first rows of a dataset split and return a shuffled sample.

    Args:
        dataset: Hub dataset id.
        config: Dataset configuration name.
        split: Split name.
        limit: Maximum number of rows to keep.

    Returns:
        A ``(features, df)`` tuple: *features* maps each feature name to its
        type as reported by the server, and *df* holds at most *limit* rows
        taken from the shuffled first-rows response.

    Raises:
        httpx.HTTPStatusError: If the server answers with an error status.
    """
    resp = client.get(
        f"{BASE_DATASETS_SERVER_URL}/first-rows",
        params={"dataset": dataset, "config": config, "split": split},
    )
    resp.raise_for_status()
    content = resp.json()

    rows = [item["row"] for item in content["rows"]]
    # Shuffle before truncating so the sample is not always the top rows.
    first_rows_df = pd.DataFrame.from_dict(rows).sample(frac=1).head(limit)

    features_dict = {
        feature["name"]: feature["type"] for feature in content["features"]
    }
    return features_dict, first_rows_df


def content_from_output(output):
    """Extract the fenced code payload from the model *output*.

    A ```` ```json ```` fence is tried first, then any triple-backtick fence.

    Args:
        output: Raw text returned by the model.

    Returns:
        The text found between the fences.

    Raises:
        ValueError: If no fenced block is present.
    """
    # Match full triple-backtick fences: a single-backtick pattern stops at
    # the first backtick inside the payload (e.g. inline code in markdown).
    logging.info("--------> Getting data from output")
    match = re.search(r"```json(.*?)```", output, re.DOTALL)
    if not match:
        logging.info("--------> Getting data from output, second try")
        match = re.search(r"```(.*?)```", output, re.DOTALL)
    if not match:
        raise ValueError("Unable to generate jupyter notebook.")

    extracted_text = match.group(1)
    logging.info(extracted_text)
    return extracted_text


def get_notebook_cells(prompt):
    """Ask the inference model for notebook cells and parse its answer.

    Args:
        prompt: Prompt produced by ``generate_eda_prompt``.

    Returns:
        The decoded JSON payload (expected to be a list of cell dicts).

    Raises:
        ValueError: If the answer has no fenced block or holds invalid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    messages = [{"role": "user", "content": prompt}]
    response = inference_client.chat_completion(messages=messages, max_tokens=2500)
    output = response.choices[0].message.content
    logging.info(output)

    content = json.loads(content_from_output(output))
    logging.info(content)
    return content


def _hidden_outputs():
    """Return updates that hide the download link and the auth row."""
    return gr.File(visible=False), gr.Row(visible=False)


def generate_notebook(dataset_id):
    """Generate an EDA notebook for *dataset_id* (Gradio click handler).

    Args:
        dataset_id: Hub dataset id selected in the UI.

    Returns:
        Updates for ``(download_link, auth_page)``: both visible on success,
        both hidden when the dataset cannot be loaded with pandas.

    Raises:
        gr.Error: If dataset metadata or rows cannot be fetched, or the
            notebook cannot be generated.
    """
    # TODO: Load dataframe from notebook here
    try:
        libraries = get_compatible_libraries(dataset_id)
    except Exception as err:
        logging.error(f"Failed to fetch compatible libraries: {err}")
        # gr.Error only reaches the UI when it is raised.
        raise gr.Error("Unable to retrieve dataset info from HF Hub.") from err

    pandas_library = next(
        (
            lib
            for lib in (libraries or {}).get("libraries", [])
            if lib["library"] == "pandas"
        ),
        None,
    )
    loading_codes = pandas_library.get("loading_codes") if pandas_library else None
    if not loading_codes:
        gr.Warning("Dataset not compatible with pandas library.")
        logging.error("Dataset not compatible with pandas library")
        return _hidden_outputs()

    first_config_loading_code = loading_codes[0]
    first_code = first_config_loading_code["code"]
    first_config = first_config_loading_code["config_name"]
    splits = first_config_loading_code["arguments"]["splits"]
    if not splits:
        gr.Warning("Dataset not compatible with pandas library.")
        logging.error("Dataset not compatible with pandas library")
        return _hidden_outputs()

    first_split = next(iter(splits))
    logging.info(f"First config: {first_config} - first split: {first_split}")
    first_file = f"hf://datasets/{dataset_id}/{splits[first_split]}"
    logging.info(f"First split file: {first_file}")

    try:
        features, df = get_first_rows_as_df(dataset_id, first_config, first_split, 3)
    except Exception as err:
        logging.error(f"Failed to fetch first rows: {err}")
        raise gr.Error("Unable to retrieve dataset info from HF Hub.") from err

    prompt = generate_eda_prompt(features, df, first_code)
    logging.info(f"Prompt: {prompt}")

    notebook_name = f"{dataset_id.replace('/', '-')}.ipynb"
    try:
        commands = get_notebook_cells(prompt)
        # TODO: Generate more commands according to column types for EDA and
        # then for auto training?
        create_notebook_file(commands, notebook_name=notebook_name)
    except Exception as err:
        # Model output is free text: a missing fence, invalid JSON or
        # malformed cells all end up here.
        logging.error(f"Failed to generate notebook: {err}")
        raise gr.Error("Unable to generate jupyter notebook.") from err

    return gr.File(value=notebook_name, visible=True), gr.Row(visible=True)


with gr.Blocks() as demo:
    gr.Markdown("# 🤖 Dataset notebook creator 🕵️")
    dataset_name = HuggingfaceHubSearch(
        label="Hub Dataset ID",
        placeholder="Search for dataset id on Huggingface",
        search_type="dataset",
        value="",
    )

    @gr.render(inputs=dataset_name)
    def embed(name):
        """Render the Hub dataset viewer for the selected dataset."""
        if not name:
            return gr.Markdown("### No dataset provided")
        viewer_url = f"https://huggingface.co/datasets/{name}/embed/viewer"
        html_code = f"""
<iframe
  src="{html.escape(viewer_url, quote=True)}"
  frameborder="0"
  width="100%"
  height="350px"
></iframe>
"""
        return gr.HTML(value=html_code)

    generate_btn = gr.Button("Generate notebook")
    download_link = gr.File(label="Download notebook", visible=False)

    with gr.Row(visible=False) as auth_page:
        with gr.Column():
            gr.Markdown(
                "Want to push to hub? Enter your token ([settings](https://huggingface.co/settings/tokens)):"
            )
            token_box = gr.Textbox(
                "", label="token", placeholder="hf_xxx", type="password"
            )
            auth_error = gr.Markdown("", visible=False)

    push_btn = gr.Button("Push notebook to hub", visible=False)
    output_lbl = gr.HTML(value="", visible=False)

    generate_btn.click(
        generate_notebook,
        inputs=[dataset_name],
        outputs=[download_link, auth_page],
    )

    def auth(token):
        """Show the push button only while a token is entered."""
        return {
            auth_error: gr.Markdown(value="", visible=False),
            push_btn: gr.Button("Push notebook to hub", visible=bool(token)),
        }

    token_box.change(
        auth,
        inputs=token_box,
        outputs=[auth_error, push_btn],
    )

    push_btn.click(
        push_notebook,
        inputs=[download_link, dataset_name, token_box],
        outputs=output_lbl,
    )

demo.launch()