Adding sample datasets
- app.py +84 -68
- utils/prompts.py +11 -9
app.py
CHANGED
@@ -16,6 +16,8 @@ from utils.prompts import (
     generate_eda_system_prompt,
     generate_embedding_system_prompt,
 )
+from dotenv import load_dotenv
+import os
 
 """
 TODOs:
@@ -36,6 +38,17 @@ TODOs:
 """
 
 # Configuration
+
+load_dotenv()
+
+HF_TOKEN = os.getenv("HF_TOKEN")
+NOTEBOOKS_REPOSITORY = os.getenv("NOTEBOOKS_REPOSITORY")
+assert HF_TOKEN is not None, "You need to set HF_TOKEN in your environment variables"
+assert (
+    NOTEBOOKS_REPOSITORY is not None
+), "You need to set NOTEBOOKS_REPOSITORY in your environment variables"
+
+
 BASE_DATASETS_SERVER_URL = "https://datasets-server.huggingface.co"
 HEADERS = {"Accept": "application/json", "Content-Type": "application/json"}
 
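The new configuration block fails fast when either variable is missing. For local runs, a minimal sketch of supplying them through a `.env` file read by `load_dotenv()` (the values are placeholders, not taken from this commit):

```python
import os
from dotenv import load_dotenv

# A placeholder .env next to app.py could contain:
#   HF_TOKEN=hf_xxxxxxxxxxxxxxxxxxxx
#   NOTEBOOKS_REPOSITORY=your-username/generated-notebooks
load_dotenv()  # by default this does not override variables already set in the environment

print(os.getenv("HF_TOKEN") is not None)  # True once the token is configured
```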
@@ -101,7 +114,7 @@ def get_txt_from_output(output):
         return content
     except Exception as e:
         gr.Error("Error when parsing notebook, try again.")
-        logging.error(f"Failed to
+        logging.error(f"Failed to parse code: {e}")
         raise
 
 
@@ -139,74 +152,64 @@ def content_from_output(output):
     return match.group(1)
 
 
-def generate_eda_cells(dataset_id
+def generate_eda_cells(dataset_id):
     for messages in generate_cells(dataset_id, generate_eda_system_prompt, "eda"):
-        yield messages,
+        yield messages, None  # Keep button hidden
 
     yield (
         messages,
-        gr.update(visible=profile and dataset_id.split("/")[0] == profile.username),
         f"{dataset_id.replace('/', '-')}-eda.ipynb",
     )
 
 
-def generate_rag_cells(dataset_id
+def generate_rag_cells(dataset_id):
     for messages in generate_cells(dataset_id, generate_rag_system_prompt, "rag"):
-        yield messages,
+        yield messages, None  # Keep button hidden
 
     yield (
         messages,
-        gr.update(visible=profile and dataset_id.split("/")[0] == profile.username),
         f"{dataset_id.replace('/', '-')}-rag.ipynb",
     )
 
 
-def generate_embedding_cells(dataset_id
+def generate_embedding_cells(dataset_id):
     for messages in generate_cells(
         dataset_id, generate_embedding_system_prompt, "embedding"
     ):
-        yield messages,
+        yield messages, None  # Keep button hidden
 
     yield (
         messages,
-        gr.update(visible=profile and dataset_id.split("/")[0] == profile.username),
         f"{dataset_id.replace('/', '-')}-embedding.ipynb",
     )
 
 
-def push_to_hub(
+def _push_to_hub(
     history,
     dataset_id,
     notebook_file,
-    profile: gr.OAuthProfile | None,
-    oauth_token: gr.OAuthToken | None,
 ):
     logging.info(f"Pushing notebook to hub: {dataset_id} on file {notebook_file}")
-    if not profile or not oauth_token:
-        yield history + [
-            gr.ChatMessage(role="assistant", content="⏳ _Login to push to hub..._")
-        ]
-        return
-    logging.info(f"Profile: {profile}, token: {oauth_token.token}")
 
-    notebook_name =
-    api = HfApi(token=
+    notebook_name = notebook_file.split("/")[-1]
+    api = HfApi(token=HF_TOKEN)
     try:
-        logging.info(f"About to push {notebook_file} - {
+        logging.info(f"About to push {notebook_file} - {dataset_id}")
         api.upload_file(
             path_or_fileobj=notebook_file,
             path_in_repo=notebook_name,
-            repo_id=
+            repo_id=NOTEBOOKS_REPOSITORY,
             repo_type="dataset",
         )
-        link = f"https://huggingface.co/datasets/{
+        link = f"https://huggingface.co/datasets/{NOTEBOOKS_REPOSITORY}/blob/main/{notebook_name}"
         logging.info(f"Notebook pushed to hub: {link}")
         yield history + [
             gr.ChatMessage(
                 role="user",
-                content=f"[
+                content=f"[{notebook_name}]({link})",
             )
         ]
+
     except Exception as e:
         logging.info("Failed to push notebook", e)
         yield history + [gr.ChatMessage(role="assistant", content=e)]
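With login removed, the upload now authenticates with the Space-level token and always targets the shared notebooks repository. A hedged sketch of the `huggingface_hub` call being made, with placeholders standing in for the environment values:

```python
from huggingface_hub import HfApi

api = HfApi(token="hf_xxx")  # placeholder for HF_TOKEN
api.upload_file(
    path_or_fileobj="jamescalam-world-cities-geo-eda.ipynb",  # local notebook produced by create_notebook_file
    path_in_repo="jamescalam-world-cities-geo-eda.ipynb",     # stored at the repository root
    repo_id="your-username/generated-notebooks",              # placeholder for NOTEBOOKS_REPOSITORY
    repo_type="dataset",
)
# The chat message then links to:
# https://huggingface.co/datasets/your-username/generated-notebooks/blob/main/jamescalam-world-cities-geo-eda.ipynb
```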
@@ -292,31 +295,50 @@ def generate_cells(dataset_id, prompt_fn, notebook_type="eda"):
     notebook_name = f"{dataset_id.replace('/', '-')}-{notebook_type}.ipynb"
     create_notebook_file(commands, notebook_name=notebook_name)
     messages.append(
-        gr.ChatMessage(role="user", content="
-    )
-    yield messages
-    messages.append(
-        gr.ChatMessage(
-            role="user",
-            content=FileData(path=notebook_name, mime_type="application/x-ipynb+json"),
-        )
+        gr.ChatMessage(role="user", content="See the generated notebook on the Hub")
     )
     yield messages
+    yield from _push_to_hub(messages, dataset_id, notebook_name)
 
 
 def coming_soon_message():
     return gr.Info("Coming soon")
 
 
+def handle_example(example, button_action):
+    return button_action(example)
+
+
-with gr.Blocks(fill_height=True) as demo:
+with gr.Blocks(fill_width=True) as demo:
     gr.Markdown("# 🤖 Dataset notebook creator 🕵️")
-    with gr.Row():
-        with gr.Column(scale=
+    with gr.Row(equal_height=True):
+        with gr.Column(scale=2):
+            text_input = gr.Textbox(label="Suggested notebook type", visible=False)
+
             dataset_name = HuggingfaceHubSearch(
                 label="Hub Dataset ID",
                 placeholder="Search for dataset id on Huggingface",
                 search_type="dataset",
-                value="",
+                value="jamescalam/world-cities-geo",
+            )
+
+            dataset_samples = gr.Examples(
+                examples=[
+                    [
+                        "infinite-dataset-hub/WorldPopCounts",
+                        "Try this dataset for Exploratory Data Analysis",
+                    ],
+                    [
+                        "infinite-dataset-hub/GlobaleCuisineRecipes",
+                        "Try this dataset for Embeddings generation",
+                    ],
+                    [
+                        "infinite-dataset-hub/GlobalBestSellersSummaries",
+                        "Try this dataset for RAG generation",
+                    ],
+                ],
+                inputs=[dataset_name, text_input],
+                cache_examples=False,
             )
 
     @gr.render(inputs=dataset_name)
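Rows in `gr.Examples` map positionally onto `inputs`, so selecting a sample fills both the dataset search box and the hidden hint textbox. A standalone sketch of that wiring, with a plain `gr.Textbox` standing in for `HuggingfaceHubSearch`:

```python
import gradio as gr

with gr.Blocks() as sketch:
    dataset_box = gr.Textbox(label="Hub Dataset ID")  # stand-in for HuggingfaceHubSearch
    hint_box = gr.Textbox(label="Suggested notebook type", visible=False)
    gr.Examples(
        examples=[
            ["infinite-dataset-hub/WorldPopCounts", "Try this dataset for Exploratory Data Analysis"],
        ],
        inputs=[dataset_box, hint_box],  # each example row fills these components in order
        cache_examples=False,
    )
```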
@@ -334,50 +356,44 @@ with gr.Blocks(fill_height=True) as demo:
         return gr.HTML(value=html_code)
 
         with gr.Row():
-            generate_eda_btn = gr.Button("
-            generate_embedding_btn = gr.Button("
-            generate_rag_btn = gr.Button("
-            generate_training_btn = gr.Button(
-
-
-
-            type="messages",
-            avatar_images=(
-                None,
-                None,
-            ),
-        )
+            generate_eda_btn = gr.Button("Exploratory Data Analysis")
+            generate_embedding_btn = gr.Button("Data Embeddings")
+            generate_rag_btn = gr.Button("RAG")
+            generate_training_btn = gr.Button(
+                "Training - Coming soon", interactive=False
+            )
+        with gr.Column(scale=1):
             with gr.Row():
-
-
+                chatbot = gr.Chatbot(
+                    label="Results",
+                    type="messages",
+                    height=650,
+                    avatar_images=(
+                        None,
+                        None,
+                    ),
+                )
+
                 notebook_file = gr.File(visible=False)
                 generate_eda_btn.click(
                     generate_eda_cells,
                     inputs=[dataset_name],
-                    outputs=[chatbot,
+                    outputs=[chatbot, notebook_file],
                 )
 
-
-
+                generate_embedding_btn.click(
+                    generate_embedding_cells,
                     inputs=[dataset_name],
-                    outputs=[chatbot,
+                    outputs=[chatbot, notebook_file],
                 )
 
-
-
+                generate_rag_btn.click(
+                    generate_rag_cells,
                     inputs=[dataset_name],
-                    outputs=[chatbot,
+                    outputs=[chatbot, notebook_file],
                 )
 
                 generate_training_btn.click(coming_soon_message, inputs=[], outputs=[])
-
-                    push_to_hub,
-                    inputs=[
-                        chatbot,
-                        dataset_name,
-                        notebook_file,
-                    ],
-                    outputs=[chatbot],
-                )
+
 
 demo.launch()
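Each button now streams from a generator whose yields are `(messages, filename)` pairs routed to `[chatbot, notebook_file]`. A hypothetical driver loop (not part of the app) showing that contract:

```python
# Intermediate yields keep the file slot at None; the final yield carries the
# generated notebook's filename for the hidden gr.File component.
for messages, notebook_name in generate_eda_cells("jamescalam/world-cities-geo"):
    if notebook_name is not None:
        print(f"Notebook ready: {notebook_name}")
```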
utils/prompts.py
CHANGED
@@ -6,8 +6,9 @@ def generate_mapping_prompt(code):
     """Format the following python code to a list of cells to be used in a jupyter notebook:
     {{ code }}
 
-
-
+    ## Instruction
+    Before returning the result, evaluate if the json object is well formatted, if not, fix it.
+    The output should be a list of json objects with the following schema, including the leading and trailing "```json" and "```":
 
     ```json
     [
@@ -42,7 +43,7 @@ def generate_eda_system_prompt():
     You create Exploratory Data Analysis jupyter notebooks with the following content:
 
     1. Install an import libraries
-    2. Load the
+    2. Load dataset as dataframe using the provided loading data code snippet
     3. Understand the dataset
     4. Check for missing values
     5. Identify the data types of each column
@@ -70,12 +71,12 @@ def generate_eda_system_prompt():
 @outlines.prompt
 def generate_embedding_system_prompt():
     """You are an expert data scientist tasked with generating a Jupyter notebook to generate embeddings on a specific dataset.
-    You
+    You must use only the following libraries: 'pandas' for data manipulation, 'sentence-transformers' to load the embedding model and 'faiss-cpu' to create the index.
     You create a jupyter notebooks with the following content:
 
-    1. Install libraries
+    1. Install libraries as !pip install
     2. Import libraries
-    3. Load dataset as dataframe
+    3. Load dataset as dataframe using the provided loading data code snippet
     4. Choose column to be used for the embeddings
     5. Remove duplicate data
     6. Load column as a list
@@ -103,12 +104,13 @@ def generate_embedding_system_prompt():
 def generate_rag_system_prompt():
     """You are an expert machine learning engineer tasked with generating a Jupyter notebook to showcase a Retrieval-Augmented Generation (RAG) system based on a specific dataset.
     The data is provided as a pandas DataFrame with the following structure:
+    You can use only the following libraries: 'pandas' for data manipulation, 'sentence-transformers' to load the embedding model, 'faiss-cpu' to create the index and 'transformers' for inference.
 
     You create Exploratory RAG jupyter notebooks with the following content:
 
     1. Install libraries
     2. Import libraries
-    3. Load dataset as dataframe
+    3. Load dataset as dataframe using the provided loading data code snippet
     4. Choose column to be used for the embeddings
     5. Remove duplicate data
     6. Load column as a list
@@ -116,8 +118,8 @@ def generate_rag_system_prompt():
     8. Create FAISS index
     9. Ask a query sample and encode it
     10. Search similar documents based on the query sample and the FAISS index
-    11. Load HuggingFaceH4/zephyr-7b-beta model from transformers library and create a pipeline
-    12. Create a prompt with two parts: 'system' to give instructions to answer a question based on a 'context' that is the retrieved similar
+    11. Load 'HuggingFaceH4/zephyr-7b-beta model' from transformers library and create a pipeline
+    12. Create a prompt with two parts: 'system' to give instructions to answer a question based on a 'context' that is the retrieved similar documents and a 'user' part with the query
     13. Send the prompt to the pipeline and show answer
 
     Ensure the notebook is well-organized, with explanations for each step.
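These prompt functions rely on `@outlines.prompt`, which (in the outlines versions this app appears to target) treats the docstring as a Jinja2 template. A small sketch of how `{{ code }}` is filled in, assuming that decorator behavior:

```python
import outlines

@outlines.prompt
def generate_mapping_prompt(code):
    """Format the following python code to a list of cells to be used in a jupyter notebook:
    {{ code }}
    """

# Calling the decorated function renders the template rather than executing a body.
prompt = generate_mapping_prompt("print('hello')")
print(prompt)  # the docstring with {{ code }} replaced by the snippet
```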