asoria committed
Commit fb98b30 · 1 Parent(s): 299235d

Small details

Files changed (2):
  1. app.py +35 -26
  2. utils/prompts.py +78 -73
app.py CHANGED
@@ -107,15 +107,11 @@ def get_first_rows_as_df(dataset: str, config: str, split: str, limit: int):


 def get_txt_from_output(output):
-    try:
-        extracted_text = extract_content_from_output(output)
-        content = json.loads(extracted_text)
-        logging.info(content)
-        return content
-    except Exception as e:
-        gr.Error("Error when parsing notebook, try again.")
-        logging.error(f"Failed to parse code: {e}")
-        raise
+    extracted_text = extract_content_from_output(output)
+    logging.info("--> Extracted text between json block")
+    logging.info(extracted_text)
+    content = json.loads(extracted_text)
+    return content


 def extract_content_from_output(output):
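Note: `extract_content_from_output` itself is unchanged and its body is not shown in this diff. Judging by the "Extracted text between json block" log line and the "```json" fences that `generate_mapping_prompt` requests, a plausible sketch of what it does (an assumption, not the Space's actual code) would be:

```python
import re

def extract_content_from_output(output: str) -> str:
    # Hypothetical sketch: pull the payload out of a ```json ... ``` block,
    # falling back to the raw output if no fence is found.
    match = re.search(r"```json\s*(.*?)```", output, re.DOTALL)
    return match.group(1).strip() if match else output.strip()
```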
@@ -266,22 +262,35 @@ def generate_cells(dataset_id, prompt_fn, notebook_type="eda"):
         yield messages
     yield messages

-    logging.info("---> Formated prompt")
-    formatted_prompt = generate_mapping_prompt(generated_text)
-    logging.info(formatted_prompt)
-    prompt_messages = [{"role": "user", "content": formatted_prompt}]
-    yield messages + [
-        gr.ChatMessage(role="assistant", content="⏳ _Generating notebook..._")
-    ]
-
-    output = inference_client.chat_completion(
-        messages=prompt_messages, stream=False, max_tokens=2500
-    )
-    cells_txt = output.choices[0].message.content
-    logging.info("---> Model output")
-    logging.info(cells_txt)
+    logging.info("---> Notebook markdown code output")
+    logging.info(generated_text)
+
+    retries = 0
+    retry_limit = 3
+    while retries < retry_limit:
+        try:
+            formatted_prompt = generate_mapping_prompt(generated_text)
+            prompt_messages = [{"role": "user", "content": formatted_prompt}]
+            yield messages + [
+                gr.ChatMessage(role="assistant", content="⏳ _Generating notebook..._")
+            ]
+
+            output = inference_client.chat_completion(
+                messages=prompt_messages, stream=False, max_tokens=2500
+            )
+            cells_txt = output.choices[0].message.content
+            logging.info(f"---> Mapping to json output attempt {retries}")
+            logging.info(cells_txt)
+            commands = get_txt_from_output(cells_txt)
+            break
+        except Exception as e:
+            logging.warn("Error when parsing output, retrying ..")
+            retries += 1
+            if retries == retry_limit:
+                logging.error(f"Unable to parse output after {retry_limit} retries")
+                gr.Error("Unable to generate notebook. Try again please")
+                raise e

-    commands = get_txt_from_output(cells_txt)
     html_code = f"<iframe src='https://huggingface.co/datasets/{dataset_id}/embed/viewer' width='80%' height='560px'></iframe>"

     commands.insert(
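The `commands.insert(` call is truncated by the hunk boundary. Given the cell schema that `generate_mapping_prompt` requests, the iframe is presumably prepended to the cell list as a markdown cell; a self-contained sketch of that step (the exact arguments are an assumption, not visible in this diff):

```python
# Hypothetical sketch of the step following this hunk: prepend the
# dataset-viewer iframe as a markdown cell, using the
# {"cell_type": ..., "source": [...]} shape from generate_mapping_prompt.
dataset_id = "user/dataset"  # placeholder
html_code = (
    f"<iframe src='https://huggingface.co/datasets/{dataset_id}/embed/viewer' "
    "width='80%' height='560px'></iframe>"
)
commands = [{"cell_type": "code", "source": ["import pandas as pd"]}]
commands.insert(0, {"cell_type": "markdown", "source": [html_code]})
```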
@@ -319,7 +328,7 @@ with gr.Blocks(fill_width=True) as demo:
         label="Hub Dataset ID",
         placeholder="Search for dataset id on Huggingface",
         search_type="dataset",
-        value="jamescalam/world-cities-geo",
+        value="",
     )

     dataset_samples = gr.Examples(
@@ -357,7 +366,7 @@ with gr.Blocks(fill_width=True) as demo:

     with gr.Row():
         generate_eda_btn = gr.Button("Exploratory Data Analysis")
-        generate_embedding_btn = gr.Button("Data Embeddings")
+        generate_embedding_btn = gr.Button("Embeddings")
         generate_rag_btn = gr.Button("RAG")
         generate_training_btn = gr.Button(
             "Training - Coming soon", interactive=False
 
utils/prompts.py CHANGED
@@ -3,21 +3,22 @@ import outlines

 @outlines.prompt
 def generate_mapping_prompt(code):
-    """Format the following python code to a list of cells to be used in a jupyter notebook:
-    {{ code }}
-
-    ## Instruction
-    Before returning the result, evaluate if the json object is well formatted, if not, fix it.
-    The output should be a list of json objects with the following schema, including the leading and trailing "```json" and "```":
+    """Convert the provided Python code into a list of cells formatted for a Jupyter notebook.
+    Ensure that the JSON objects are correctly formatted; if they are not, correct them.
+    Do not include an extra comma at the end of the final list element.

+    The output should be a list of JSON objects with the following format:
     ```json
     [
         {
-            "cell_type": string // This refers either is a markdown or code cell type.
-            "source": list of string separated by comma // This is the list of text or python code.
+            "cell_type": "string", // Specify "markdown" or "code".
+            "source": ["string1", "string2"] // List of text or code strings.
         }
     ]
     ```
+
+    ## Code
+    {{ code }}
     """

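As background for reading these templates: `@outlines.prompt` treats the decorated function's docstring as a Jinja2 template, so calling the function returns the rendered prompt string (at least in the outlines versions that expose this decorator). A small usage sketch with an illustrative template:

```python
import outlines

@outlines.prompt
def example_prompt(code):
    """Format this code as notebook cells:

    ## Code
    {{ code }}
    """

# Calling the function renders the docstring template with `code` substituted.
rendered = example_prompt("import pandas as pd")
print(rendered)
```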
 
@@ -37,26 +38,27 @@ def generate_user_prompt(columns_info, sample_data, first_code):

 @outlines.prompt
 def generate_eda_system_prompt():
-    """You are an expert data analyst tasked with generating an exploratory data analysis (EDA) Jupyter notebook.
-    You can use only the following libraries: Pandas for data manipulation, Matplotlib and Seaborn for visualisations, make sure to add them as part of the notebook for installation.
-
-    You create Exploratory Data Analysis jupyter notebooks with the following content:
-
-    1. Install an import libraries
-    2. Load dataset as dataframe using the provided loading data code snippet
-    3. Understand the dataset
-    4. Check for missing values
-    5. Identify the data types of each column
-    6. Identify duplicated rows
-    7. Generate descriptive statistics
-    8. Visualize the distribution of each column
-    9. Visualize the relationship between columns
-    10. Correlation analysis
-    11. Any additional relevant visualizations or analyses you deem appropriate.
-
-    Ensure the notebook is well-organized, with explanations for each step.
-    The output should be a markdown content enclosing with "```python" and "```" the python code snippets.
-    The user will provide you information about the dataset in the following format:
+    """You are an expert data analyst tasked with creating an Exploratory Data Analysis (EDA) Jupyter notebook.
+    Use only the following libraries: Pandas for data manipulation, Matplotlib and Seaborn for visualizations. Ensure these libraries are installed as part of the notebook.
+
+    The EDA notebook should include:
+
+    1. Install and import necessary libraries.
+    2. Load the dataset as a DataFrame using the provided code.
+    3. Understand the dataset structure.
+    4. Check for missing values.
+    5. Identify data types of each column.
+    6. Detect duplicated rows.
+    7. Generate descriptive statistics.
+    8. Visualize the distribution of each column.
+    9. Explore relationships between columns.
+    10. Perform correlation analysis.
+    11. Include any additional relevant visualizations or analyses.
+
+    Ensure the notebook is well-organized with clear explanations for each step.
+    The output should be Markdown content with Python code snippets enclosed in "```python" and "```".
+
+    The user will provide the dataset information in the following format:

     ## Columns and Data Types
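For a sense of what the generated notebooks contain, several of these steps reduce to one-liners; a condensed sketch with a toy DataFrame standing in for the dataset (the real notebook loads it with the provided snippet):

```python
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Toy stand-in; the generated notebook uses the provided loading-data snippet.
df = pd.DataFrame({"a": [1, 2, 2, 4], "b": [0.5, None, 2.5, 4.0]})

print(df.isnull().sum())      # 4. missing values per column
print(df.dtypes)              # 5. data types
print(df.duplicated().sum())  # 6. duplicated rows
print(df.describe())          # 7. descriptive statistics

df.hist(figsize=(6, 3))       # 8. distribution of each column
sns.heatmap(df.corr(numeric_only=True), annot=True)  # 10. correlation analysis
plt.show()
```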
 
@@ -64,30 +66,32 @@ def generate_eda_system_prompt():

     ## Loading Data code

-    It is mandatory that you use the provided code to load the dataset, DO NOT try to load the dataset in any other way.
+    Use the provided code to load the dataset; do not use any other method.
     """


 @outlines.prompt
 def generate_embedding_system_prompt():
-    """You are an expert data scientist tasked with generating a Jupyter notebook to generate embeddings on a specific dataset.
-    You must use only the following libraries: 'pandas' for data manipulation, 'sentence-transformers' to load the embedding model and 'faiss-cpu' to create the index.
-    You create a jupyter notebooks with the following content:
-
-    1. Install libraries as !pip install
-    2. Import libraries
-    3. Load dataset as dataframe using the provided loading data code snippet
-    4. Choose column to be used for the embeddings
-    5. Remove duplicate data
-    6. Load column as a list
-    7. Load sentence-transformers model
-    8. Create FAISS index
-    9. Ask a query sample and encode it
-    10. Search similar documents based on the query sample and the FAISS index
-
-    Ensure the notebook is well-organized, with explanations for each step.
-    The output should be a markdown content enclosing with "```python" and "```" the python code snippets.
-    The user will provide you information about the dataset in the following format:
+    """You are an expert data scientist tasked with creating a Jupyter notebook to generate embeddings for a specific dataset.
+    Use only the following libraries: 'pandas' for data manipulation, 'sentence-transformers' to load the embedding model, and 'faiss-cpu' to create the index.
+
+    The notebook should include:
+
+    1. Install necessary libraries with !pip install.
+    2. Import libraries.
+    3. Load the dataset as a DataFrame using the provided code.
+    4. Select the column to generate embeddings.
+    5. Remove duplicate data.
+    6. Convert the selected column to a list.
+    7. Load the sentence-transformers model.
+    8. Create a FAISS index.
+    9. Encode a query sample.
+    10. Search for similar documents using the FAISS index.
+
+    Ensure the notebook is well-organized with explanations for each step.
+    The output should be Markdown content with Python code snippets enclosed in "```python" and "```".
+
+    The user will provide dataset information in the following format:

     ## Columns and Data Types
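Steps 7–10 of this recipe map onto a few lines of sentence-transformers and FAISS; a minimal sketch with toy documents and an example model choice ('all-MiniLM-L6-v2' is illustrative, not mandated by the prompt):

```python
import faiss
from sentence_transformers import SentenceTransformer

documents = ["cats purr", "dogs bark", "birds sing"]  # toy column values

model = SentenceTransformer("all-MiniLM-L6-v2")   # 7. load the model
embeddings = model.encode(documents)              # float32 array, (n_docs, dim)

index = faiss.IndexFlatL2(embeddings.shape[1])    # 8. create the FAISS index
index.add(embeddings)

query = model.encode(["which animal barks?"])     # 9. encode a query sample
distances, indices = index.search(query, 2)       # 10. retrieve similar docs
print([documents[i] for i in indices[0]])
```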
 
@@ -95,36 +99,37 @@ def generate_embedding_system_prompt():

     ## Loading Data code

-    It is mandatory that you use the provided code to load the dataset, DO NOT try to load the dataset in any other way.
-
+    Use the provided code to load the dataset; do not use any other method.
     """


 @outlines.prompt
 def generate_rag_system_prompt():
-    """You are an expert machine learning engineer tasked with generating a Jupyter notebook to showcase a Retrieval-Augmented Generation (RAG) system based on a specific dataset.
-    The data is provided as a pandas DataFrame with the following structure:
-    You can use only the following libraries: 'pandas' for data manipulation, 'sentence-transformers' to load the embedding model, 'faiss-cpu' to create the index and 'transformers' for inference.
-
-    You create Exploratory RAG jupyter notebooks with the following content:
-
-    1. Install libraries
-    2. Import libraries
-    3. Load dataset as dataframe using the provided loading data code snippet
-    4. Choose column to be used for the embeddings
-    5. Remove duplicate data
-    6. Load column as a list
-    7. Load sentence-transformers model
-    8. Create FAISS index
-    9. Ask a query sample and encode it
-    10. Search similar documents based on the query sample and the FAISS index
-    11. Load 'HuggingFaceH4/zephyr-7b-beta model' from transformers library and create a pipeline
-    12. Create a prompt with two parts: 'system' to give instructions to answer a question based on a 'context' that is the retrieved similar documents and a 'user' part with the query
-    13. Send the prompt to the pipeline and show answer
-
-    Ensure the notebook is well-organized, with explanations for each step.
-    The output should be a markdown content enclosing with "```python" and "```" the python code snippets.
-    The user will provide you information about the dataset in the following format:
+    """You are an expert machine learning engineer tasked with creating a Jupyter notebook to demonstrate a Retrieval-Augmented Generation (RAG) system using a specific dataset.
+    The dataset is provided as a pandas DataFrame.
+
+    Use only the following libraries: 'pandas' for data manipulation, 'sentence-transformers' to load the embedding model, 'faiss-cpu' to create the index, and 'transformers' for inference.
+
+    The RAG notebook should include:
+
+    1. Install necessary libraries.
+    2. Import libraries.
+    3. Load the dataset as a DataFrame using the provided code.
+    4. Select the column for generating embeddings.
+    5. Remove duplicate data.
+    6. Convert the selected column to a list.
+    7. Load the sentence-transformers model.
+    8. Create a FAISS index.
+    9. Encode a query sample.
+    10. Search for similar documents using the FAISS index.
+    11. Load the 'HuggingFaceH4/zephyr-7b-beta' model from the transformers library and create a pipeline.
+    12. Create a prompt with two parts: 'system' for instructions based on a 'context' from the retrieved documents, and 'user' for the query.
+    13. Send the prompt to the pipeline and display the answer.
+
+    Ensure the notebook is well-organized with explanations for each step.
+    The output should be Markdown content with Python code snippets enclosed in "```python" and "```".
+
+    The user will provide the dataset information in the following format:

     ## Columns and Data Types
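Steps 11–13 extend the embedding flow with generation. A sketch of that tail end, assuming the retrieved documents are already in hand (zephyr-7b-beta needs a sizeable GPU, and the chat-style pipeline input requires a recent transformers release, so treat this as illustrative):

```python
from transformers import pipeline

# 11. load the model named in the prompt and build a pipeline
generator = pipeline("text-generation", model="HuggingFaceH4/zephyr-7b-beta")

# 12. two-part prompt: system instructions carrying the retrieved context,
#     plus the user query (retrieved_docs would come from the FAISS search)
retrieved_docs = ["cats purr", "dogs bark"]
messages = [
    {"role": "system",
     "content": "Answer the question using this context: " + " ".join(retrieved_docs)},
    {"role": "user", "content": "Which animal barks?"},
]

# 13. send the prompt to the pipeline and show the answer
result = generator(messages, max_new_tokens=100)
print(result[0]["generated_text"])
```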
 
@@ -132,5 +137,5 @@ def generate_rag_system_prompt():

     ## Loading Data code

-    It is mandatory that you use the provided code to load the dataset, DO NOT try to load the dataset in any other way.
+    Use the provided code to load the dataset; do not use any other method.
     """
 