asoria committed
Commit fb98b30 · 1 Parent(s): 299235d

Small details

Files changed (2):
  1. app.py +35 -26
  2. utils/prompts.py +78 -73
app.py CHANGED
@@ -107,15 +107,11 @@ def get_first_rows_as_df(dataset: str, config: str, split: str, limit: int):


 def get_txt_from_output(output):
-    try:
-        extracted_text = extract_content_from_output(output)
-        content = json.loads(extracted_text)
-        logging.info(content)
-        return content
-    except Exception as e:
-        gr.Error("Error when parsing notebook, try again.")
-        logging.error(f"Failed to parse code: {e}")
-        raise
+    extracted_text = extract_content_from_output(output)
+    logging.info("--> Extracted text between json block")
+    logging.info(extracted_text)
+    content = json.loads(extracted_text)
+    return content


 def extract_content_from_output(output):
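Note: `extract_content_from_output` itself is unchanged and its body is not shown in this diff. Judging by the "Extracted text between json block" log line and the "```json" fences that `generate_mapping_prompt` requests, a plausible sketch of what it does (an assumption, not the Space's actual code) would be:

```python
import re

def extract_content_from_output(output: str) -> str:
    # Hypothetical sketch: pull the payload out of a ```json ... ``` block,
    # falling back to the raw output if no fence is found.
    match = re.search(r"```json\s*(.*?)```", output, re.DOTALL)
    return match.group(1).strip() if match else output.strip()
```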
@@ -266,22 +262,35 @@ def generate_cells(dataset_id, prompt_fn, notebook_type="eda"):
         yield messages
     yield messages

-    logging.info("---> Formated prompt")
-    formatted_prompt = generate_mapping_prompt(generated_text)
-    logging.info(formatted_prompt)
-    prompt_messages = [{"role": "user", "content": formatted_prompt}]
-    yield messages + [
-        gr.ChatMessage(role="assistant", content="⏳ _Generating notebook..._")
-    ]
-
-    output = inference_client.chat_completion(
-        messages=prompt_messages, stream=False, max_tokens=2500
-    )
-    cells_txt = output.choices[0].message.content
-    logging.info("---> Model output")
-    logging.info(cells_txt)
+    logging.info("---> Notebook markdown code output")
+    logging.info(generated_text)
+
+    retries = 0
+    retry_limit = 3
+    while retries < retry_limit:
+        try:
+            formatted_prompt = generate_mapping_prompt(generated_text)
+            prompt_messages = [{"role": "user", "content": formatted_prompt}]
+            yield messages + [
+                gr.ChatMessage(role="assistant", content="⏳ _Generating notebook..._")
+            ]
+
+            output = inference_client.chat_completion(
+                messages=prompt_messages, stream=False, max_tokens=2500
+            )
+            cells_txt = output.choices[0].message.content
+            logging.info(f"---> Mapping to json output attempt {retries}")
+            logging.info(cells_txt)
+            commands = get_txt_from_output(cells_txt)
+            break
+        except Exception as e:
+            logging.warn("Error when parsing output, retrying ..")
+            retries += 1
+            if retries == retry_limit:
+                logging.error(f"Unable to parse output after {retry_limit} retries")
+                gr.Error("Unable to generate notebook. Try again please")
+                raise e

-    commands = get_txt_from_output(cells_txt)
     html_code = f"<iframe src='https://huggingface.co/datasets/{dataset_id}/embed/viewer' width='80%' height='560px'></iframe>"

     commands.insert(
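The `commands.insert(` call is truncated by the hunk boundary. Given the cell schema that `generate_mapping_prompt` requests, the iframe is presumably prepended to the cell list as a markdown cell; a self-contained sketch of that step (the exact arguments are an assumption, not visible in this diff):

```python
# Hypothetical sketch of the step following this hunk: prepend the
# dataset-viewer iframe as a markdown cell, using the
# {"cell_type": ..., "source": [...]} shape from generate_mapping_prompt.
dataset_id = "user/dataset"  # placeholder
html_code = (
    f"<iframe src='https://huggingface.co/datasets/{dataset_id}/embed/viewer' "
    "width='80%' height='560px'></iframe>"
)
commands = [{"cell_type": "code", "source": ["import pandas as pd"]}]
commands.insert(0, {"cell_type": "markdown", "source": [html_code]})
```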
@@ -319,7 +328,7 @@ with gr.Blocks(fill_width=True) as demo:
         label="Hub Dataset ID",
         placeholder="Search for dataset id on Huggingface",
         search_type="dataset",
-        value="jamescalam/world-cities-geo",
+        value="",
     )

     dataset_samples = gr.Examples(
@@ -357,7 +366,7 @@ with gr.Blocks(fill_width=True) as demo:

     with gr.Row():
         generate_eda_btn = gr.Button("Exploratory Data Analysis")
-        generate_embedding_btn = gr.Button("Data Embeddings")
+        generate_embedding_btn = gr.Button("Embeddings")
         generate_rag_btn = gr.Button("RAG")
         generate_training_btn = gr.Button(
             "Training - Coming soon", interactive=False
 
utils/prompts.py CHANGED
@@ -3,21 +3,22 @@ import outlines

 @outlines.prompt
 def generate_mapping_prompt(code):
-    """Format the following python code to a list of cells to be used in a jupyter notebook:
-    {{ code }}
-
-    ## Instruction
-    Before returning the result, evaluate if the json object is well formatted, if not, fix it.
-    The output should be a list of json objects with the following schema, including the leading and trailing "```json" and "```":
+    """Convert the provided Python code into a list of cells formatted for a Jupyter notebook.
+    Ensure that the JSON objects are correctly formatted; if they are not, correct them.
+    Do not include an extra comma at the end of the final list element.

+    The output should be a list of JSON objects with the following format:
     ```json
     [
         {
-            "cell_type": string // This refers either is a markdown or code cell type.
-            "source": list of string separated by comma // This is the list of text or python code.
+            "cell_type": "string", // Specify "markdown" or "code".
+            "source": ["string1", "string2"] // List of text or code strings.
         }
     ]
     ```
+
+    ## Code
+    {{ code }}
     """

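As background for reading these templates: `@outlines.prompt` treats the decorated function's docstring as a Jinja2 template, so calling the function returns the rendered prompt string (at least in the outlines versions that expose this decorator). A small usage sketch with an illustrative template:

```python
import outlines

@outlines.prompt
def example_prompt(code):
    """Format this code as notebook cells:

    ## Code
    {{ code }}
    """

# Calling the function renders the docstring template with `code` substituted.
rendered = example_prompt("import pandas as pd")
print(rendered)
```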
 
@@ -37,26 +38,27 @@ def generate_user_prompt(columns_info, sample_data, first_code):

 @outlines.prompt
 def generate_eda_system_prompt():
-    """You are an expert data analyst tasked with generating an exploratory data analysis (EDA) Jupyter notebook.
-    You can use only the following libraries: Pandas for data manipulation, Matplotlib and Seaborn for visualisations, make sure to add them as part of the notebook for installation.
-
-    You create Exploratory Data Analysis jupyter notebooks with the following content:
-
-    1. Install an import libraries
-    2. Load dataset as dataframe using the provided loading data code snippet
-    3. Understand the dataset
-    4. Check for missing values
-    5. Identify the data types of each column
-    6. Identify duplicated rows
-    7. Generate descriptive statistics
-    8. Visualize the distribution of each column
-    9. Visualize the relationship between columns
-    10. Correlation analysis
-    11. Any additional relevant visualizations or analyses you deem appropriate.
-
-    Ensure the notebook is well-organized, with explanations for each step.
-    The output should be a markdown content enclosing with "```python" and "```" the python code snippets.
-    The user will provide you information about the dataset in the following format:
+    """You are an expert data analyst tasked with creating an Exploratory Data Analysis (EDA) Jupyter notebook.
+    Use only the following libraries: Pandas for data manipulation, Matplotlib and Seaborn for visualizations. Ensure these libraries are installed as part of the notebook.
+
+    The EDA notebook should include:
+
+    1. Install and import necessary libraries.
+    2. Load the dataset as a DataFrame using the provided code.
+    3. Understand the dataset structure.
+    4. Check for missing values.
+    5. Identify data types of each column.
+    6. Detect duplicated rows.
+    7. Generate descriptive statistics.
+    8. Visualize the distribution of each column.
+    9. Explore relationships between columns.
+    10. Perform correlation analysis.
+    11. Include any additional relevant visualizations or analyses.
+
+    Ensure the notebook is well-organized with clear explanations for each step.
+    The output should be Markdown content with Python code snippets enclosed in "```python" and "```".
+
+    The user will provide the dataset information in the following format:

     ## Columns and Data Types
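For a sense of what the generated notebooks contain, several of these steps reduce to one-liners; a condensed sketch with a toy DataFrame standing in for the dataset (the real notebook loads it with the provided snippet):

```python
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Toy stand-in; the generated notebook uses the provided loading-data snippet.
df = pd.DataFrame({"a": [1, 2, 2, 4], "b": [0.5, None, 2.5, 4.0]})

print(df.isnull().sum())      # 4. missing values per column
print(df.dtypes)              # 5. data types
print(df.duplicated().sum())  # 6. duplicated rows
print(df.describe())          # 7. descriptive statistics

df.hist(figsize=(6, 3))       # 8. distribution of each column
sns.heatmap(df.corr(numeric_only=True), annot=True)  # 10. correlation analysis
plt.show()
```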
 
@@ -64,30 +66,32 @@ def generate_eda_system_prompt():

     ## Loading Data code

-    It is mandatory that you use the provided code to load the dataset, DO NOT try to load the dataset in any other way.
+    Use the provided code to load the dataset; do not use any other method.
     """


 @outlines.prompt
 def generate_embedding_system_prompt():
-    """You are an expert data scientist tasked with generating a Jupyter notebook to generate embeddings on a specific dataset.
-    You must use only the following libraries: 'pandas' for data manipulation, 'sentence-transformers' to load the embedding model and 'faiss-cpu' to create the index.
-    You create a jupyter notebooks with the following content:
-
-    1. Install libraries as !pip install
-    2. Import libraries
-    3. Load dataset as dataframe using the provided loading data code snippet
-    4. Choose column to be used for the embeddings
-    5. Remove duplicate data
-    6. Load column as a list
-    7. Load sentence-transformers model
-    8. Create FAISS index
-    9. Ask a query sample and encode it
-    10. Search similar documents based on the query sample and the FAISS index
-
-    Ensure the notebook is well-organized, with explanations for each step.
-    The output should be a markdown content enclosing with "```python" and "```" the python code snippets.
-    The user will provide you information about the dataset in the following format:
+    """You are an expert data scientist tasked with creating a Jupyter notebook to generate embeddings for a specific dataset.
+    Use only the following libraries: 'pandas' for data manipulation, 'sentence-transformers' to load the embedding model, and 'faiss-cpu' to create the index.
+
+    The notebook should include:
+
+    1. Install necessary libraries with !pip install.
+    2. Import libraries.
+    3. Load the dataset as a DataFrame using the provided code.
+    4. Select the column to generate embeddings.
+    5. Remove duplicate data.
+    6. Convert the selected column to a list.
+    7. Load the sentence-transformers model.
+    8. Create a FAISS index.
+    9. Encode a query sample.
+    10. Search for similar documents using the FAISS index.
+
+    Ensure the notebook is well-organized with explanations for each step.
+    The output should be Markdown content with Python code snippets enclosed in "```python" and "```".
+
+    The user will provide dataset information in the following format:

     ## Columns and Data Types
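Steps 7–10 of this recipe map onto a few lines of sentence-transformers and FAISS; a minimal sketch with toy documents and an example model choice ('all-MiniLM-L6-v2' is illustrative, not mandated by the prompt):

```python
import faiss
from sentence_transformers import SentenceTransformer

documents = ["cats purr", "dogs bark", "birds sing"]  # toy column values

model = SentenceTransformer("all-MiniLM-L6-v2")   # 7. load the model
embeddings = model.encode(documents)              # float32 array, (n_docs, dim)

index = faiss.IndexFlatL2(embeddings.shape[1])    # 8. create the FAISS index
index.add(embeddings)

query = model.encode(["which animal barks?"])     # 9. encode a query sample
distances, indices = index.search(query, 2)       # 10. retrieve similar docs
print([documents[i] for i in indices[0]])
```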
 
@@ -95,36 +99,37 @@ def generate_embedding_system_prompt():

     ## Loading Data code

-    It is mandatory that you use the provided code to load the dataset, DO NOT try to load the dataset in any other way.
-
+    Use the provided code to load the dataset; do not use any other method.
     """


 @outlines.prompt
 def generate_rag_system_prompt():
-    """You are an expert machine learning engineer tasked with generating a Jupyter notebook to showcase a Retrieval-Augmented Generation (RAG) system based on a specific dataset.
-    The data is provided as a pandas DataFrame with the following structure:
-    You can use only the following libraries: 'pandas' for data manipulation, 'sentence-transformers' to load the embedding model, 'faiss-cpu' to create the index and 'transformers' for inference.
-
-    You create Exploratory RAG jupyter notebooks with the following content:
-
-    1. Install libraries
-    2. Import libraries
-    3. Load dataset as dataframe using the provided loading data code snippet
-    4. Choose column to be used for the embeddings
-    5. Remove duplicate data
-    6. Load column as a list
-    7. Load sentence-transformers model
-    8. Create FAISS index
-    9. Ask a query sample and encode it
-    10. Search similar documents based on the query sample and the FAISS index
-    11. Load 'HuggingFaceH4/zephyr-7b-beta model' from transformers library and create a pipeline
-    12. Create a prompt with two parts: 'system' to give instructions to answer a question based on a 'context' that is the retrieved similar documents and a 'user' part with the query
-    13. Send the prompt to the pipeline and show answer
-
-    Ensure the notebook is well-organized, with explanations for each step.
-    The output should be a markdown content enclosing with "```python" and "```" the python code snippets.
-    The user will provide you information about the dataset in the following format:
+    """You are an expert machine learning engineer tasked with creating a Jupyter notebook to demonstrate a Retrieval-Augmented Generation (RAG) system using a specific dataset.
+    The dataset is provided as a pandas DataFrame.
+
+    Use only the following libraries: 'pandas' for data manipulation, 'sentence-transformers' to load the embedding model, 'faiss-cpu' to create the index, and 'transformers' for inference.
+
+    The RAG notebook should include:
+
+    1. Install necessary libraries.
+    2. Import libraries.
+    3. Load the dataset as a DataFrame using the provided code.
+    4. Select the column for generating embeddings.
+    5. Remove duplicate data.
+    6. Convert the selected column to a list.
+    7. Load the sentence-transformers model.
+    8. Create a FAISS index.
+    9. Encode a query sample.
+    10. Search for similar documents using the FAISS index.
+    11. Load the 'HuggingFaceH4/zephyr-7b-beta' model from the transformers library and create a pipeline.
+    12. Create a prompt with two parts: 'system' for instructions based on a 'context' from the retrieved documents, and 'user' for the query.
+    13. Send the prompt to the pipeline and display the answer.
+
+    Ensure the notebook is well-organized with explanations for each step.
+    The output should be Markdown content with Python code snippets enclosed in "```python" and "```".
+
+    The user will provide the dataset information in the following format:

     ## Columns and Data Types
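Steps 11–13 extend the embedding flow with generation. A sketch of that tail end, assuming the retrieved documents are already in hand (zephyr-7b-beta needs a sizeable GPU, and the chat-style pipeline input requires a recent transformers release, so treat this as illustrative):

```python
from transformers import pipeline

# 11. load the model named in the prompt and build a pipeline
generator = pipeline("text-generation", model="HuggingFaceH4/zephyr-7b-beta")

# 12. two-part prompt: system instructions carrying the retrieved context,
#     plus the user query (retrieved_docs would come from the FAISS search)
retrieved_docs = ["cats purr", "dogs bark"]
messages = [
    {"role": "system",
     "content": "Answer the question using this context: " + " ".join(retrieved_docs)},
    {"role": "user", "content": "Which animal barks?"},
]

# 13. send the prompt to the pipeline and show the answer
result = generator(messages, max_new_tokens=100)
print(result[0]["generated_text"])
```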
 
@@ -132,5 +137,5 @@ def generate_rag_system_prompt():

     ## Loading Data code

-    It is mandatory that you use the provided code to load the dataset, DO NOT try to load the dataset in any other way.
+    Use the provided code to load the dataset; do not use any other method.
     """
 