asoria HF staff committed on
Commit
806dbf3
·
1 Parent(s): 45f97ba

Add Rag basic prompt

Browse files
Files changed (2) hide show
  1. app.py +34 -5
  2. utils/prompts.py +75 -23
app.py CHANGED
@@ -11,8 +11,10 @@ import pandas as pd
11
  from gradio.data_classes import FileData
12
  from utils.prompts import (
13
  generate_mapping_prompt,
14
- generate_eda_prompt,
15
  generate_embedding_prompt,
 
 
 
16
  )
17
 
18
  """
@@ -58,7 +60,11 @@ def get_compatible_libraries(dataset: str):
58
  def create_notebook_file(cell_commands, notebook_name):
59
  nb = nbf.v4.new_notebook()
60
  nb["cells"] = [
61
- nbf.v4.new_code_cell(cmd["source"])
 
 
 
 
62
  if cmd["cell_type"] == "code"
63
  else nbf.v4.new_markdown_cell(cmd["source"])
64
  for cmd in cell_commands
@@ -134,7 +140,7 @@ def content_from_output(output):
134
 
135
 
136
  def generate_eda_cells(dataset_id, profile: gr.OAuthProfile | None):
137
- for messages in generate_cells(dataset_id, generate_eda_prompt, "eda"):
138
  yield messages, gr.update(visible=False), None # Keep button hidden
139
 
140
  yield (
@@ -144,6 +150,17 @@ def generate_eda_cells(dataset_id, profile: gr.OAuthProfile | None):
144
  )
145
 
146
 
 
 
 
 
 
 
 
 
 
 
 
147
  def generate_embedding_cells(dataset_id, profile: gr.OAuthProfile | None):
148
  for messages in generate_cells(dataset_id, generate_embedding_prompt, "embedding"):
149
  yield messages, gr.update(visible=False), None # Keep button hidden
@@ -219,11 +236,16 @@ def generate_cells(dataset_id, prompt_fn, notebook_type="eda"):
219
  first_config = first_config_loading_code["config_name"]
220
  first_split = list(first_config_loading_code["arguments"]["splits"].keys())[0]
221
  features, df = get_first_rows_as_df(dataset_id, first_config, first_split, 3)
222
- prompt = prompt_fn(features, df.head(5).to_dict(orient="records"), first_code)
 
 
223
  messages = [gr.ChatMessage(role="user", content=prompt)]
224
  yield messages + [gr.ChatMessage(role="assistant", content="⏳ _Starting task..._")]
225
 
226
- prompt_messages = [{"role": "user", "content": prompt}]
 
 
 
227
  output = inference_client.chat_completion(
228
  messages=prompt_messages, stream=True, max_tokens=2500
229
  )
@@ -312,6 +334,7 @@ with gr.Blocks(fill_height=True) as demo:
312
  with gr.Row():
313
  generate_eda_btn = gr.Button("Generate EDA notebook")
314
  generate_embedding_btn = gr.Button("Generate Embeddings notebook")
 
315
  generate_training_btn = gr.Button("Generate Training notebook")
316
  with gr.Column():
317
  chatbot = gr.Chatbot(
@@ -332,6 +355,12 @@ with gr.Blocks(fill_height=True) as demo:
332
  outputs=[chatbot, push_btn, notebook_file],
333
  )
334
 
 
 
 
 
 
 
335
  generate_embedding_btn.click(
336
  generate_embedding_cells,
337
  inputs=[dataset_name],
 
11
  from gradio.data_classes import FileData
12
  from utils.prompts import (
13
  generate_mapping_prompt,
 
14
  generate_embedding_prompt,
15
+ generate_user_prompt,
16
+ generate_rag_system_prompt,
17
+ generate_eda_system_prompt,
18
  )
19
 
20
  """
 
60
  def create_notebook_file(cell_commands, notebook_name):
61
  nb = nbf.v4.new_notebook()
62
  nb["cells"] = [
63
+ nbf.v4.new_code_cell(
64
+ cmd["source"]
65
+ if isinstance(cmd["source"], str)
66
+ else "\n".join(cmd["source"])
67
+ )
68
  if cmd["cell_type"] == "code"
69
  else nbf.v4.new_markdown_cell(cmd["source"])
70
  for cmd in cell_commands
 
140
 
141
 
142
  def generate_eda_cells(dataset_id, profile: gr.OAuthProfile | None):
143
+ for messages in generate_cells(dataset_id, generate_eda_system_prompt, "eda"):
144
  yield messages, gr.update(visible=False), None # Keep button hidden
145
 
146
  yield (
 
150
  )
151
 
152
 
153
+ def generate_rag_cells(dataset_id, profile: gr.OAuthProfile | None):
154
+ for messages in generate_cells(dataset_id, generate_rag_system_prompt, "rag"):
155
+ yield messages, gr.update(visible=False), None # Keep button hidden
156
+
157
+ yield (
158
+ messages,
159
+ gr.update(visible=profile and dataset_id.split("/")[0] == profile.username),
160
+ f"{dataset_id.replace('/', '-')}-rag.ipynb",
161
+ )
162
+
163
+
164
  def generate_embedding_cells(dataset_id, profile: gr.OAuthProfile | None):
165
  for messages in generate_cells(dataset_id, generate_embedding_prompt, "embedding"):
166
  yield messages, gr.update(visible=False), None # Keep button hidden
 
236
  first_config = first_config_loading_code["config_name"]
237
  first_split = list(first_config_loading_code["arguments"]["splits"].keys())[0]
238
  features, df = get_first_rows_as_df(dataset_id, first_config, first_split, 3)
239
+ prompt = generate_user_prompt(
240
+ features, df.head(5).to_dict(orient="records"), first_code
241
+ )
242
  messages = [gr.ChatMessage(role="user", content=prompt)]
243
  yield messages + [gr.ChatMessage(role="assistant", content="⏳ _Starting task..._")]
244
 
245
+ prompt_messages = [
246
+ {"role": "system", "content": prompt_fn()},
247
+ {"role": "user", "content": prompt},
248
+ ]
249
  output = inference_client.chat_completion(
250
  messages=prompt_messages, stream=True, max_tokens=2500
251
  )
 
334
  with gr.Row():
335
  generate_eda_btn = gr.Button("Generate EDA notebook")
336
  generate_embedding_btn = gr.Button("Generate Embeddings notebook")
337
+ generate_rag_btn = gr.Button("Generate RAG notebook")
338
  generate_training_btn = gr.Button("Generate Training notebook")
339
  with gr.Column():
340
  chatbot = gr.Chatbot(
 
355
  outputs=[chatbot, push_btn, notebook_file],
356
  )
357
 
358
+ generate_rag_btn.click(
359
+ generate_rag_cells,
360
+ inputs=[dataset_name],
361
+ outputs=[chatbot, push_btn, notebook_file],
362
+ )
363
+
364
  generate_embedding_btn.click(
365
  generate_embedding_cells,
366
  inputs=[dataset_name],
utils/prompts.py CHANGED
@@ -21,37 +21,55 @@ def generate_mapping_prompt(code):
21
 
22
 
23
  @outlines.prompt
24
- def generate_eda_prompt(columns_info, sample_data, first_code):
25
- """You are an expert data analyst tasked with generating an exploratory data analysis (EDA) Jupyter notebook. The data is provided as a pandas DataFrame with the following structure:
26
-
27
- Columns and Data Types:
28
  {{ columns_info }}
29
 
30
- Sample Data:
31
  {{ sample_data }}
32
 
33
- Please create a pandas EDA notebook that includes the following:
 
 
 
34
 
35
- 1. Summary statistics for numerical columns.
36
- 2. Distribution plots for numerical columns.
37
- 3. Bar plots or count plots for categorical columns.
38
- 4. Correlation matrix and heatmap for numerical columns.
39
- 5. Any additional relevant visualizations or analyses you deem appropriate.
 
 
 
 
 
 
 
 
 
 
 
 
 
40
 
41
  Ensure the notebook is well-organized, with explanations for each step.
 
 
42
 
43
- It is mandatory that you use the following code to load the dataset, DO NOT try to load the dataset in any other way:
44
 
45
- {{ first_code }}
46
 
47
- The output should be a markdown python code snippet between the leading and trailing "```python" and "```".
48
 
 
49
  """
50
 
51
 
52
  @outlines.prompt
53
- def generate_embedding_prompt(columns_info, sample_data, first_code):
54
- """You are an expert data scientist tasked with generating a Jupyter notebook to generate embeddings from a dataset.
55
  The data is provided as a pandas DataFrame with the following structure:
56
 
57
  Columns and Data Types:
@@ -60,24 +78,58 @@ def generate_embedding_prompt(columns_info, sample_data, first_code):
60
  Sample Data:
61
  {{ sample_data }}
62
 
63
- Please create a notebook that includes the following:
64
 
65
  1. Load the dataset
66
  2. Load embedding model using sentence-transformers library
67
  3. Convert data into embeddings
68
  4. Store embeddings
69
-
70
  Ensure the notebook is well-organized, with explanations for each step.
 
 
71
 
72
- It is mandatory that you use the following code to load the dataset, DO NOT try to load the dataset in any other way:
 
 
 
 
 
 
73
 
74
- {{ first_code }}
75
 
76
  """
77
 
78
 
79
  @outlines.prompt
80
- def generate_training_prompt(columns_info, sample_data, first_code):
81
- """
82
- TODO
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  """
 
21
 
22
 
23
  @outlines.prompt
24
+ def generate_user_prompt(columns_info, sample_data, first_code):
25
+ """
26
+ ## Columns and Data Types
 
27
  {{ columns_info }}
28
 
29
+ ## Sample Data
30
  {{ sample_data }}
31
 
32
+ ## Loading Data code
33
+ {{ first_code }}
34
+ """
35
+
36
 
37
+ @outlines.prompt
38
+ def generate_eda_system_prompt():
39
+ """You are an expert data analyst tasked with generating an exploratory data analysis (EDA) Jupyter notebook.
40
+ You can use only the following libraries: Pandas for data manipulation, Matplotlib and Seaborn for visualisations, make sure to add them as part of the notebook for installation.
41
+
42
+ You create Exploratory Data Analysis jupyter notebooks with the following content:
43
+
44
+ 1. Install and import libraries
45
+ 2. Load the dataset
46
+ 3. Understand the dataset
47
+ 4. Check for missing values
48
+ 5. Identify the data types of each column
49
+ 6. Identify duplicated rows
50
+ 7. Generate descriptive statistics
51
+ 8. Visualize the distribution of each column
52
+ 9. Visualize the relationship between columns
53
+ 10. Correlation analysis
54
+ 11. Any additional relevant visualizations or analyses you deem appropriate.
55
 
56
  Ensure the notebook is well-organized, with explanations for each step.
57
+ The output should be markdown content, with every Python code snippet enclosed between "```python" and "```".
58
+ The user will provide you information about the dataset in the following format:
59
 
60
+ ## Columns and Data Types
61
 
62
+ ## Sample Data
63
 
64
+ ## Loading Data code
65
 
66
+ It is mandatory that you use the provided code to load the dataset, DO NOT try to load the dataset in any other way.
67
  """
68
 
69
 
70
  @outlines.prompt
71
+ def generate_embedding_system_prompt():
72
+ """You are an expert data scientist tasked with generating a Jupyter notebook to generate embeddings on a specific dataset.
73
  The data is provided as a pandas DataFrame with the following structure:
74
 
75
  Columns and Data Types:
 
78
  Sample Data:
79
  {{ sample_data }}
80
 
81
+ Please create a notebook that includes the following steps:
82
 
83
  1. Load the dataset
84
  2. Load embedding model using sentence-transformers library
85
  3. Convert data into embeddings
86
  4. Store embeddings
 
87
  Ensure the notebook is well-organized, with explanations for each step.
88
+ The output should be markdown content, with every Python code snippet enclosed between "```python" and "```".
89
+ The user will provide you information about the dataset in the following format:
90
 
91
+ ## Columns and Data Types
92
+
93
+ ## Sample Data
94
+
95
+ ## Loading Data code
96
+
97
+ It is mandatory that you use the provided code to load the dataset, DO NOT try to load the dataset in any other way.
98
 
 
99
 
100
  """
101
 
102
 
103
  @outlines.prompt
104
+ def generate_rag_system_prompt():
105
+ """You are an expert machine learning engineer tasked with generating a Jupyter notebook to showcase a Retrieval-Augmented Generation (RAG) system based on a specific dataset.
106
+ The data is provided as a pandas DataFrame with the following structure:
107
+
108
+ You create Exploratory RAG jupyter notebooks with the following content:
109
+
110
+ 1. Install libraries
111
+ 2. Import libraries
112
+ 3. Load dataset as dataframe
113
+ 4. Choose column to be used for the embeddings
114
+ 5. Remove duplicate data
115
+ 6. Load column as a list
116
+ 7. Load sentence-transformers model
117
+ 8. Create FAISS index
118
+ 9. Ask a query sample and encode it
119
+ 10. Search similar documents based on the query sample and the FAISS index
120
+ 11. Load HuggingFaceH4/zephyr-7b-beta model from transformers library and create a pipeline
121
+ 12. Create a prompt with two parts: 'system' to give instructions to answer a question based on a 'context' that is the retrieved similar documents and a 'user' part with the query
122
+ 13. Send the prompt to the pipeline and show answer
123
+
124
+ Ensure the notebook is well-organized, with explanations for each step.
125
+ The output should be markdown content, with every Python code snippet enclosed between "```python" and "```".
126
+ The user will provide you information about the dataset in the following format:
127
+
128
+ ## Columns and Data Types
129
+
130
+ ## Sample Data
131
+
132
+ ## Loading Data code
133
+
134
+ It is mandatory that you use the provided code to load the dataset, DO NOT try to load the dataset in any other way.
135
  """