Spaces:

asoria
/

auto-dataset-analyst-creator

Sleeping

App Files Files Community

asoria HF staff committed on Aug 23, 2024

Commit

806dbf3

1 Parent(s): 45f97ba

Add Rag basic prompt

Browse files

Files changed (2) hide show

app.py +34 -5
utils/prompts.py +75 -23

app.py CHANGED Viewed

@@ -11,8 +11,10 @@ import pandas as pd
 from gradio.data_classes import FileData
 from utils.prompts import (
     generate_mapping_prompt,
-    generate_eda_prompt,
     generate_embedding_prompt,
 )
 """
@@ -58,7 +60,11 @@ def get_compatible_libraries(dataset: str):
 def create_notebook_file(cell_commands, notebook_name):
     nb = nbf.v4.new_notebook()
     nb["cells"] = [
-        nbf.v4.new_code_cell(cmd["source"])
         if cmd["cell_type"] == "code"
         else nbf.v4.new_markdown_cell(cmd["source"])
         for cmd in cell_commands
@@ -134,7 +140,7 @@ def content_from_output(output):
 def generate_eda_cells(dataset_id, profile: gr.OAuthProfile | None):
-    for messages in generate_cells(dataset_id, generate_eda_prompt, "eda"):
         yield messages, gr.update(visible=False), None  # Keep button hidden
     yield (
@@ -144,6 +150,17 @@ def generate_eda_cells(dataset_id, profile: gr.OAuthProfile | None):
     )
 def generate_embedding_cells(dataset_id, profile: gr.OAuthProfile | None):
     for messages in generate_cells(dataset_id, generate_embedding_prompt, "embedding"):
         yield messages, gr.update(visible=False), None  # Keep button hidden
@@ -219,11 +236,16 @@ def generate_cells(dataset_id, prompt_fn, notebook_type="eda"):
     first_config = first_config_loading_code["config_name"]
     first_split = list(first_config_loading_code["arguments"]["splits"].keys())[0]
     features, df = get_first_rows_as_df(dataset_id, first_config, first_split, 3)
-    prompt = prompt_fn(features, df.head(5).to_dict(orient="records"), first_code)
     messages = [gr.ChatMessage(role="user", content=prompt)]
     yield messages + [gr.ChatMessage(role="assistant", content="⏳ _Starting task..._")]
-    prompt_messages = [{"role": "user", "content": prompt}]
     output = inference_client.chat_completion(
         messages=prompt_messages, stream=True, max_tokens=2500
     )
@@ -312,6 +334,7 @@ with gr.Blocks(fill_height=True) as demo:
             with gr.Row():
                 generate_eda_btn = gr.Button("Generate EDA notebook")
                 generate_embedding_btn = gr.Button("Generate Embeddings notebook")
                 generate_training_btn = gr.Button("Generate Training notebook")
         with gr.Column():
             chatbot = gr.Chatbot(
@@ -332,6 +355,12 @@ with gr.Blocks(fill_height=True) as demo:
         outputs=[chatbot, push_btn, notebook_file],
     )
     generate_embedding_btn.click(
         generate_embedding_cells,
         inputs=[dataset_name],

 from gradio.data_classes import FileData
 from utils.prompts import (
     generate_mapping_prompt,
     generate_embedding_prompt,
+    generate_user_prompt,
+    generate_rag_system_prompt,
+    generate_eda_system_prompt,
 )
 """
 def create_notebook_file(cell_commands, notebook_name):
     nb = nbf.v4.new_notebook()
     nb["cells"] = [
+        nbf.v4.new_code_cell(
+            cmd["source"]
+            if isinstance(cmd["source"], str)
+            else "\n".join(cmd["source"])
+        )
         if cmd["cell_type"] == "code"
         else nbf.v4.new_markdown_cell(cmd["source"])
         for cmd in cell_commands
 def generate_eda_cells(dataset_id, profile: gr.OAuthProfile | None):
+    for messages in generate_cells(dataset_id, generate_eda_system_prompt, "eda"):
         yield messages, gr.update(visible=False), None  # Keep button hidden
     yield (
     )
+def generate_rag_cells(dataset_id, profile: gr.OAuthProfile | None):
+    for messages in generate_cells(dataset_id, generate_rag_system_prompt, "rag"):
+        yield messages, gr.update(visible=False), None  # Keep button hidden
+    yield (
+        messages,
+        gr.update(visible=profile and dataset_id.split("/")[0] == profile.username),
+        f"{dataset_id.replace('/', '-')}-rag.ipynb",
+    )
 def generate_embedding_cells(dataset_id, profile: gr.OAuthProfile | None):
     for messages in generate_cells(dataset_id, generate_embedding_prompt, "embedding"):
         yield messages, gr.update(visible=False), None  # Keep button hidden
     first_config = first_config_loading_code["config_name"]
     first_split = list(first_config_loading_code["arguments"]["splits"].keys())[0]
     features, df = get_first_rows_as_df(dataset_id, first_config, first_split, 3)
+    prompt = generate_user_prompt(
+        features, df.head(5).to_dict(orient="records"), first_code
+    )
     messages = [gr.ChatMessage(role="user", content=prompt)]
     yield messages + [gr.ChatMessage(role="assistant", content="⏳ _Starting task..._")]
+    prompt_messages = [
+        {"role": "system", "content": prompt_fn()},
+        {"role": "user", "content": prompt},
+    ]
     output = inference_client.chat_completion(
         messages=prompt_messages, stream=True, max_tokens=2500
     )
             with gr.Row():
                 generate_eda_btn = gr.Button("Generate EDA notebook")
                 generate_embedding_btn = gr.Button("Generate Embeddings notebook")
+                generate_rag_btn = gr.Button("Generate RAG notebook")
                 generate_training_btn = gr.Button("Generate Training notebook")
         with gr.Column():
             chatbot = gr.Chatbot(
         outputs=[chatbot, push_btn, notebook_file],
     )
+    generate_rag_btn.click(
+        generate_rag_cells,
+        inputs=[dataset_name],
+        outputs=[chatbot, push_btn, notebook_file],
+    )
     generate_embedding_btn.click(
         generate_embedding_cells,
         inputs=[dataset_name],

utils/prompts.py CHANGED Viewed

@@ -21,37 +21,55 @@ def generate_mapping_prompt(code):
 @outlines.prompt
-def generate_eda_prompt(columns_info, sample_data, first_code):
-    """You are an expert data analyst tasked with generating an exploratory data analysis (EDA) Jupyter notebook. The data is provided as a pandas DataFrame with the following structure:
-    Columns and Data Types:
     {{ columns_info }}
-    Sample Data:
     {{ sample_data }}
-    Please create a pandas EDA notebook that includes the following:
-    1. Summary statistics for numerical columns.
-    2. Distribution plots for numerical columns.
-    3. Bar plots or count plots for categorical columns.
-    4. Correlation matrix and heatmap for numerical columns.
-    5. Any additional relevant visualizations or analyses you deem appropriate.
     Ensure the notebook is well-organized, with explanations for each step.
-    It is mandatory that you use the following code to load the dataset, DO NOT try to load the dataset in any other way:
-    {{ first_code }}
-    The output should be a markdown python code snippet between the leading and trailing "```python" and "```".
     """
 @outlines.prompt
-def generate_embedding_prompt(columns_info, sample_data, first_code):
-    """You are an expert data scientist tasked with generating a Jupyter notebook to generate embeddings from a dataset.
     The data is provided as a pandas DataFrame with the following structure:
     Columns and Data Types:
@@ -60,24 +78,58 @@ def generate_embedding_prompt(columns_info, sample_data, first_code):
     Sample Data:
     {{ sample_data }}
-    Please create a notebook that includes the following:
     1. Load the dataset
     2. Load embedding model using sentence-transformers library
     3. Convert data into embeddings
     4. Store embeddings
     Ensure the notebook is well-organized, with explanations for each step.
-    It is mandatory that you use the following code to load the dataset, DO NOT try to load the dataset in any other way:
-    {{ first_code }}
     """
 @outlines.prompt
-def generate_training_prompt(columns_info, sample_data, first_code):
-    """
-    TODO
     """

 @outlines.prompt
+def generate_user_prompt(columns_info, sample_data, first_code):
+    """
+    ## Columns and Data Types
     {{ columns_info }}
+    ## Sample Data
     {{ sample_data }}
+    ## Loading Data code
+    {{ first_code }}
+    """
+@outlines.prompt
+def generate_eda_system_prompt():
+    """You are an expert data analyst tasked with generating an exploratory data analysis (EDA) Jupyter notebook.
+    You can use only the following libraries: Pandas for data manipulation, Matplotlib and Seaborn for visualisations, make sure to add them as part of the notebook for installation.
+    You create Exploratory Data Analysis jupyter notebooks with the following content:
+    1. Install and import libraries
+    2. Load the dataset
+    3. Understand the dataset
+    4. Check for missing values
+    5. Identify the data types of each column
+    6. Identify duplicated rows
+    7. Generate descriptive statistics
+    8. Visualize the distribution of each column
+    9. Visualize the relationship between columns
+    10. Correlation analysis
+    11. Any additional relevant visualizations or analyses you deem appropriate.
     Ensure the notebook is well-organized, with explanations for each step.
+    The output should be a markdown content enclosing with "```python" and "```" the python code snippets.
+    The user will provide you information about the dataset in the following format:
+    ## Columns and Data Types
+    ## Sample Data
+    ## Loading Data code
+    It is mandatory that you use the provided code to load the dataset, DO NOT try to load the dataset in any other way.
     """
 @outlines.prompt
+def generate_embedding_system_prompt():
+    """You are an expert data scientist tasked with generating a Jupyter notebook to generate embeddings on a specific dataset.
     The data is provided as a pandas DataFrame with the following structure:
     Columns and Data Types:
     Sample Data:
     {{ sample_data }}
+    Please create a notebook that includes the following steps:
     1. Load the dataset
     2. Load embedding model using sentence-transformers library
     3. Convert data into embeddings
     4. Store embeddings
     Ensure the notebook is well-organized, with explanations for each step.
+    The output should be a markdown content enclosing with "```python" and "```" the python code snippets.
+    The user will provide you information about the dataset in the following format:
+    ## Columns and Data Types
+    ## Sample Data
+    ## Loading Data code
+    It is mandatory that you use the provided code to load the dataset, DO NOT try to load the dataset in any other way.
     """
 @outlines.prompt
+def generate_rag_system_prompt():
+    """You are an expert machine learning engineer tasked with generating a Jupyter notebook to showcase a Retrieval-Augmented Generation (RAG) system based on a specific dataset.
+    The data is provided as a pandas DataFrame with the following structure:
+    You create Exploratory RAG jupyter notebooks with the following content:
+    1. Install libraries
+    2. Import libraries
+    3. Load dataset as dataframe
+    4. Choose column to be used for the embeddings
+    5. Remove duplicate data
+    6. Load column as a list
+    7. Load sentence-transformers model
+    8. Create FAISS index
+    9. Ask a query sample and encode it
+    10. Search similar documents based on the query sample and the FAISS index
+    11. Load HuggingFaceH4/zephyr-7b-beta model from transformers library and create a pipeline
+    12. Create a prompt with two parts: 'system' to give instructions to answer a question based on a 'context' that is the retrieved similar documents and a 'user' part with the query
+    13. Send the prompt to the pipeline and show answer
+    Ensure the notebook is well-organized, with explanations for each step.
+    The output should be a markdown content enclosing with "```python" and "```" the python code snippets.
+    The user will provide you information about the dataset in the following format:
+    ## Columns and Data Types
+    ## Sample Data
+    ## Loading Data code
+    It is mandatory that you use the provided code to load the dataset, DO NOT try to load the dataset in any other way.
     """