asoria (HF staff) committed
Commit 810f00f · 1 Parent(s): f327376

Try to generate commands from InferenceClient call

Files changed (2):
  1. README.md +1 -1
  2. app.py +169 -25
README.md CHANGED
@@ -1,5 +1,5 @@
 ---
-title: Auto Dataset Analyst Creator
+title: Dataset automatic notebook creator
 emoji: 🏢
 colorFrom: gray
 colorTo: indigo
app.py CHANGED
@@ -4,41 +4,120 @@ import nbformat as nbf
 from huggingface_hub import HfApi
 from httpx import Client
 import logging
-
+from huggingface_hub import InferenceClient
+import json
+import re
 
 """
 TODOs:
-- Add more commands to the notebook
+- Refactor
+- Make the notebook generation more dynamic; add loading components so the UI does not freeze
+- Fix errors:
+  - When generating output
+  - When parsing output
+  - When pushing notebook
 - Parametrize the commands (Move to another file)
-- Let user choose the framework and get it from /compatible-libraries
 - Use an LLM to suggest commands by column types
-- Add commands for auto training
+- Add target tasks to choose for the notebook:
+  - Exploratory data analysis
+  - Auto training
+  - RAG
+  - etc.
 - Enable 'generate notebook' button only if dataset is available and supports library
+- First get compatible-libraries and let user choose the library
 """
 
 # Configuration
 BASE_DATASETS_SERVER_URL = "https://datasets-server.huggingface.co"
 HEADERS = {"Accept": "application/json", "Content-Type": "application/json"}
 client = Client(headers=HEADERS)
+inference_client = InferenceClient("meta-llama/Meta-Llama-3-8B-Instruct")
 
 logging.basicConfig(level=logging.INFO)
 
 
 def get_compatible_libraries(dataset: str):
-    try:
     resp = client.get(
         f"{BASE_DATASETS_SERVER_URL}/compatible-libraries?dataset={dataset}"
     )
     resp.raise_for_status()
     return resp.json()
-    except Exception as err:
-        logging.error(f"Failed to fetch compatible libraries: {err}")
-        return None
 
+import pandas as pd
+
+def generate_eda_prompt(columns_info, df, first_code):
+    # columns_info = df.dtypes.to_dict()
+    sample_data = df.head(5).to_dict(orient='records')
+    # prompt = (
+    #     "You are an expert data analyst tasked with generating an exploratory data analysis (EDA) jupyter notebook. "
+    #     "The data is provided as a pandas DataFrame with the following structure:\n\n"
+    #     f"Columns and Data Types:\n{columns_info}\n\n"
+    #     f"Sample Data:\n{sample_data}\n\n"
+    #     "Please create a pandas EDA notebook that includes the following:\n"
+    #     "1. Summary statistics for numerical columns.\n"
+    #     "2. Distribution plots for numerical columns.\n"
+    #     "3. Bar plots or count plots for categorical columns.\n"
+    #     "4. Correlation matrix and heatmap for numerical columns.\n"
+    #     "5. Any other relevant visualizations or analyses you deem appropriate.\n\n"
+    #     "Ensure the notebook is well-organized, with explanations for each step."
+    #     f"You can use the following code to load the dataset:\n\n{first_code}\n"
+    #     """The output should be a markdown code snippet formatted in the following schema, including the leading and trailing "```json" and "```":
+    #     ```json
+    #     [
+    #         {
+    #             "cell_type": string // Whether this is a markdown or a code cell.
+    #             "source": list of string // The list of text or Python code lines.
+    #         }
+    #     ]
+    #     ```
+    #     Do not include more information than necessary, as this will be used to generate the notebook.
+    #     """
+    # )
+    format_instructions = """
+    The output should be a markdown code snippet formatted in the
+    following schema, including the leading and trailing "```json" and "```":
+
+    ```json
+    [
+        {
+            "cell_type": string // Whether this is a markdown or a code cell.
+            "source": list of string // The list of text or Python code lines.
+        }
+    ]
+    ```
+    """
+
+    prompt = """
+    You are an expert data analyst tasked with generating an exploratory data analysis (EDA) Jupyter notebook. The data is provided as a pandas DataFrame with the following structure:
+
+    Columns and Data Types:
+    {columns_info}
+
+    Sample Data:
+    {sample_data}
+
+    Please create a pandas EDA notebook that includes the following:
+
+    1. Summary statistics for numerical columns.
+    2. Distribution plots for numerical columns.
+    3. Bar plots or count plots for categorical columns.
+    4. Correlation matrix and heatmap for numerical columns.
+    5. Any additional relevant visualizations or analyses you deem appropriate.
+
+    Ensure the notebook is well-organized, with explanations for each step.
+
+    It is mandatory that you use the following code to load the dataset; DO NOT try to load the dataset in any other way:
+
+    {first_code}
+
+    {format_instructions}
+    """
+    return prompt.format(columns_info=columns_info, sample_data=sample_data, first_code=first_code, format_instructions=format_instructions)
 
 def create_notebook_file(cell_commands, notebook_name):
     nb = nbf.v4.new_notebook()
-    nb["cells"] = [nbf.v4.new_code_cell(command) for command in cell_commands]
+    nb["cells"] = [nbf.v4.new_code_cell(command['source']) if command['cell_type'] == 'code' else nbf.v4.new_markdown_cell(command['source']) for command in cell_commands]
 
     with open(notebook_name, "w") as f:
         nbf.write(nb, f)
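The format_instructions string above pins the model to a JSON list of cells, which the updated create_notebook_file then turns into notebook cells. A minimal sketch of that round trip, with an invented sample payload standing in for real model output (the cell contents and file name below are illustrative, not from the commit):

import nbformat as nbf

# Hand-written stand-in for a model response that follows the schema:
# a list of {"cell_type": ..., "source": [...]} objects.
sample_cells = [
    {"cell_type": "markdown", "source": ["# Exploratory Data Analysis"]},
    {"cell_type": "code", "source": ["import pandas as pd", "df.describe()"]},
]

nb = nbf.v4.new_notebook()
# Same branching as the new create_notebook_file: code cells vs. markdown
# cells. Joining the source lines hands nbformat a plain string.
nb["cells"] = [
    nbf.v4.new_code_cell("\n".join(cell["source"]))
    if cell["cell_type"] == "code"
    else nbf.v4.new_markdown_cell("\n".join(cell["source"]))
    for cell in sample_cells
]

with open("sample.ipynb", "w") as f:
    nbf.write(nb, f)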
 
@@ -64,35 +143,100 @@ def push_notebook(file_path, dataset_id, token):
         logging.error(f"Failed to push notebook: {err}")
         return gr.HTML(value="Failed to push notebook", visible=True)
 
+def get_first_rows_as_df(dataset: str, config: str, split: str, limit: int):
+    resp = client.get(f"{BASE_DATASETS_SERVER_URL}/first-rows?dataset={dataset}&config={config}&split={split}")
+    resp.raise_for_status()
+    content = resp.json()
+    rows = content["rows"]
+    rows = [row['row'] for row in rows]
+    first_rows_df = pd.DataFrame.from_dict(rows).sample(frac=1).head(limit)
+    features = content['features']
+    features_dict = {feature['name']: feature['type'] for feature in features}
+    return features_dict, first_rows_df
+
+
+def content_from_output(output):
+    pattern = r'`json(.*?)`'
+    logging.info("--------> Getting data from output")
+    match = re.search(pattern, output, re.DOTALL)
+    if not match:
+        pattern = r'```(.*?)```'
+        logging.info("--------> Getting data from output, second try")
+        match = re.search(pattern, output, re.DOTALL)
+        if not match:
+            raise Exception("Unable to generate jupyter notebook.")
+    extracted_text = match.group(1)
+    logging.info(extracted_text)
+
+
+def get_notebook_cells(prompt):
+    messages = [{"role": "user", "content": prompt}]
+    output = inference_client.chat_completion(messages=messages, max_tokens=2500)
+    output = output.choices[0].message.content
+    logging.info(output)
+    pattern = r'`json(.*?)`'
+    logging.info("--------> Getting data from output")
+    match = re.search(pattern, output, re.DOTALL)
+    if not match:
+        raise Exception("Unable to generate jupyter notebook.")
+    extracted_text = match.group(1)
+    logging.info(extracted_text)
+    content = json.loads(extracted_text)
+    logging.info(content)
+    return content
 
 def generate_notebook(dataset_id):
-    first_code = f"import pandas as pd\n\ndf = pd.read_parquet('hf://datasets/{dataset_id}/data/train-00000-of-00001.parquet')"
-    libraries = get_compatible_libraries(dataset_id)
+
+    # TODO: Load dataframe from notebook here
+    # generate_eda_prompt
+
+    try:
+        libraries = get_compatible_libraries(dataset_id)
+    except Exception as err:
+        gr.Error('Unable to retrieve dataset info from HF Hub.')
+        logging.error(f"Failed to fetch compatible libraries: {err}")
+        return None
 
     if not libraries:
+        gr.Warning('Dataset not compatible with pandas library.')
+        logging.error("Dataset not compatible with pandas library")
         return gr.File(visible=False), gr.Row.update(visible=False)
 
     pandas_library = next(
         (lib for lib in libraries.get("libraries", []) if lib["library"] == "pandas"),
         None,
     )
-    if pandas_library:
-        first_code = pandas_library["loading_codes"][0]["code"]
-    else:
+    if not pandas_library:
+        gr.Warning('Dataset not compatible with pandas library.')
+        logging.error("Dataset not compatible with pandas library")
         return gr.File(visible=False), gr.Row.update(visible=False)
 
+    first_config_loading_code = pandas_library['loading_codes'][0]
+    first_code = first_config_loading_code['code']
+
+    first_config = first_config_loading_code['config_name']
+    first_split = list(first_config_loading_code['arguments']['splits'].keys())[0]
+    logging.info(f"First config: {first_config} - first split: {first_split}")
+    first_file = f"hf://datasets/{dataset_id}/{first_config_loading_code['arguments']['splits'][first_split]}"
+    logging.info(f"First split file: {first_file}")
     html_code = f"<iframe src='https://huggingface.co/datasets/{dataset_id}/embed/viewer' width='80%' height='560px'></iframe>"
-    commands = [
-        "!pip install pandas",
-        first_code,
-        "df.head()",
-        f'from IPython.display import HTML\n\ndisplay(HTML("{html_code}"))',
-        "print(df.shape)",
-        "df.columns",
-        "df.describe()",
-        "df.info()",
-        # TODO: Generate more commands according to column types for EDA and then for auto training?
-    ]
+    features, df = get_first_rows_as_df(dataset_id, first_config, first_split, 3)
+    prompt = generate_eda_prompt(features, df, first_code)
+    logging.info(f"Prompt: {prompt}")
+    commands = get_notebook_cells(prompt)
+    # TODO: Generate these commands using InferenceClient
+    # commands = [
+    #     "!pip install pandas",
+    #     "import pandas as pd",
+    #     f"df = pd.read_parquet('{first_file}')",
+    #     "df.head()",
+    #     f'from IPython.display import HTML\n\ndisplay(HTML("{html_code}"))',
+    #     "print(df.shape)",
+    #     "df.columns",
+    #     "df.describe()",
+    #     "df.info()",
+    #     # TODO: Generate more commands according to column types for EDA and then for auto training?
+    # ]
     notebook_name = f"{dataset_id.replace('/', '-')}.ipynb"
     create_notebook_file(commands, notebook_name=notebook_name)
     return gr.File(value=notebook_name, visible=True), gr.Row.update(visible=True)
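For context on the two datasets-server calls generate_notebook now chains, here is a minimal standalone sketch. The dataset id is a placeholder; the response shapes mirror exactly the keys the committed code reads (libraries / loading_codes / arguments.splits, and features / rows.row):

from httpx import Client
import pandas as pd

client = Client(headers={"Accept": "application/json"})
BASE = "https://datasets-server.huggingface.co"
dataset = "some-user/some-dataset"  # placeholder id, not from the commit

# /compatible-libraries lists per-library loading snippets plus the
# config/split layout that generate_notebook pulls the parquet path from.
libs = client.get(f"{BASE}/compatible-libraries?dataset={dataset}").json()
pandas_lib = next(lib for lib in libs["libraries"] if lib["library"] == "pandas")
loading = pandas_lib["loading_codes"][0]
config = loading["config_name"]
split = list(loading["arguments"]["splits"])[0]

# /first-rows returns the features and a small sample of rows, which
# get_first_rows_as_df turns into the DataFrame embedded in the prompt.
content = client.get(
    f"{BASE}/first-rows?dataset={dataset}&config={config}&split={split}"
).json()
df = pd.DataFrame([r["row"] for r in content["rows"]]).head(3)
features = {f["name"]: f["type"] for f in content["features"]}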
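And a self-contained sketch of the extraction step in get_notebook_cells, with a hand-written reply standing in for the inference_client.chat_completion(...).choices[0].message.content string:

import json
import re

# Invented model reply following the ```json fence requested by the prompt.
output = 'Here you go:\n```json\n[{"cell_type": "code", "source": ["df.head()"]}]\n```'

# The single backtick before "json" in the pattern matches the last backtick
# of the opening ```json fence; the lazy group then stops at the first
# backtick of the closing fence, capturing just the JSON payload.
match = re.search(r'`json(.*?)`', output, re.DOTALL)
if not match:
    raise Exception("Unable to generate jupyter notebook.")
cells = json.loads(match.group(1))
print(cells)  # [{'cell_type': 'code', 'source': ['df.head()']}]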