Small details
Files changed:
- app.py +35 -26
- utils/prompts.py +78 -73
app.py
CHANGED
@@ -107,15 +107,11 @@ def get_first_rows_as_df(dataset: str, config: str, split: str, limit: int):
 
 
 def get_txt_from_output(output):
-
-
-
-
-
-    except Exception as e:
-        gr.Error("Error when parsing notebook, try again.")
-        logging.error(f"Failed to parse code: {e}")
-        raise
+    extracted_text = extract_content_from_output(output)
+    logging.info("--> Extracted text between json block")
+    logging.info(extracted_text)
+    content = json.loads(extracted_text)
+    return content
 
 
 def extract_content_from_output(output):
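The rewritten `get_txt_from_output` delegates to `extract_content_from_output` (its signature is visible in the context line above) before handing the result to `json.loads`. That helper's body is not part of this diff, so the following is only a plausible sketch, assuming it strips the JSON code fences that `generate_mapping_prompt` (in utils/prompts.py below) asks the model to emit:

```python
import re

def extract_content_from_output(output):
    # Hypothetical reconstruction -- the real body is not shown in this diff.
    # The mapping prompt asks the model to wrap its JSON in fenced json
    # blocks, so grab whatever sits between the first pair of fences.
    match = re.search(r"```json\s*(.*?)\s*```", output, re.DOTALL)
    if match:
        return match.group(1)
    return output  # fall back to the raw text if no fences were emitted
```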
@@ -266,22 +262,35 @@ def generate_cells(dataset_id, prompt_fn, notebook_type="eda"):
     yield messages
     yield messages
 
-    logging.info("--->
-
-
-
-
-
-
-
-
-
-
-
-
-
+    logging.info("---> Notebook markdown code output")
+    logging.info(generated_text)
+
+    retries = 0
+    retry_limit = 3
+    while retries < retry_limit:
+        try:
+            formatted_prompt = generate_mapping_prompt(generated_text)
+            prompt_messages = [{"role": "user", "content": formatted_prompt}]
+            yield messages + [
+                gr.ChatMessage(role="assistant", content="⏳ _Generating notebook..._")
+            ]
+
+            output = inference_client.chat_completion(
+                messages=prompt_messages, stream=False, max_tokens=2500
+            )
+            cells_txt = output.choices[0].message.content
+            logging.info(f"---> Mapping to json output attempt {retries}")
+            logging.info(cells_txt)
+            commands = get_txt_from_output(cells_txt)
+            break
+        except Exception as e:
+            logging.warn("Error when parsing output, retrying ..")
+            retries += 1
+            if retries == retry_limit:
+                logging.error(f"Unable to parse output after {retry_limit} retries")
+                gr.Error("Unable to generate notebook. Try again please")
+                raise e
 
-    commands = get_txt_from_output(cells_txt)
     html_code = f"<iframe src='https://huggingface.co/datasets/{dataset_id}/embed/viewer' width='80%' height='560px'></iframe>"
 
     commands.insert(
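On a successful attempt, `commands` holds the list of cell objects (`{"cell_type": ..., "source": [...]}`) that `generate_mapping_prompt` requests, and the surrounding code inserts extra cells such as the dataset-viewer iframe before the notebook is produced. The diff does not show the final conversion step, so the helper below is a hypothetical sketch of how such a list typically becomes an .ipynb file with `nbformat`:

```python
import nbformat
from nbformat.v4 import new_code_cell, new_markdown_cell, new_notebook

def create_notebook_file(cell_commands, notebook_name="notebook.ipynb"):
    # Hypothetical helper: maps each {"cell_type", "source"} object from the
    # model onto a real notebook cell and writes the result to disk.
    cells = [
        new_code_cell("".join(cmd["source"]))
        if cmd["cell_type"] == "code"
        else new_markdown_cell("".join(cmd["source"]))
        for cmd in cell_commands
    ]
    nbformat.write(new_notebook(cells=cells), notebook_name)
```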
@@ -319,7 +328,7 @@ with gr.Blocks(fill_width=True) as demo:
             label="Hub Dataset ID",
             placeholder="Search for dataset id on Huggingface",
             search_type="dataset",
-            value="
+            value="",
         )
 
         dataset_samples = gr.Examples(
@@ -357,7 +366,7 @@ with gr.Blocks(fill_width=True) as demo:
 
         with gr.Row():
             generate_eda_btn = gr.Button("Exploratory Data Analysis")
-            generate_embedding_btn = gr.Button("
+            generate_embedding_btn = gr.Button("Embeddings")
             generate_rag_btn = gr.Button("RAG")
             generate_training_btn = gr.Button(
                 "Training - Coming soon", interactive=False
utils/prompts.py
CHANGED
@@ -3,21 +3,22 @@ import outlines
 
 @outlines.prompt
 def generate_mapping_prompt(code):
-    """
-
-
-    ## Instruction
-    Before returning the result, evaluate if the json object is well formatted, if not, fix it.
-    The output should be a list of json objects with the following schema, including the leading and trailing "```json" and "```":
+    """Convert the provided Python code into a list of cells formatted for a Jupyter notebook.
+    Ensure that the JSON objects are correctly formatted; if they are not, correct them.
+    Do not include an extra comma at the end of the final list element.
 
+    The output should be a list of JSON objects with the following format:
     ```json
     [
         {
-            "cell_type": string //
-            "source":
+            "cell_type": "string", // Specify "markdown" or "code".
+            "source": ["string1", "string2"] // List of text or code strings.
         }
     ]
     ```
+
+    ## Code
+    {{ code }}
     """
 
 
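The new `## Code` section with `{{ code }}` is how the argument reaches the template: `@outlines.prompt` treats the function's docstring as a Jinja2 template and renders it with the call's arguments. A minimal illustration with a hypothetical function, not from this repo:

```python
import outlines

@outlines.prompt
def example_prompt(code):
    """Convert this snippet:
    {{ code }}"""

# Calling the decorated function renders the template with its arguments:
print(example_prompt("print('hi')"))
# Convert this snippet:
# print('hi')
```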
@@ -37,26 +38,27 @@ def generate_user_prompt(columns_info, sample_data, first_code):
 
 @outlines.prompt
 def generate_eda_system_prompt():
-    """You are an expert data analyst tasked with
-
-
-
-
-    1. Install
-    2. Load dataset as
-    3. Understand the dataset
-    4. Check for missing values
-    5. Identify
-    6.
-    7. Generate descriptive statistics
-    8. Visualize the distribution of each column
-    9.
-    10.
-    11.
-
-    Ensure the notebook is well-organized
-    The output should be
-
+    """You are an expert data analyst tasked with creating an Exploratory Data Analysis (EDA) Jupyter notebook.
+    Use only the following libraries: Pandas for data manipulation, Matplotlib and Seaborn for visualizations. Ensure these libraries are installed as part of the notebook.
+
+    The EDA notebook should include:
+
+    1. Install and import necessary libraries.
+    2. Load the dataset as a DataFrame using the provided code.
+    3. Understand the dataset structure.
+    4. Check for missing values.
+    5. Identify data types of each column.
+    6. Detect duplicated rows.
+    7. Generate descriptive statistics.
+    8. Visualize the distribution of each column.
+    9. Explore relationships between columns.
+    10. Perform correlation analysis.
+    11. Include any additional relevant visualizations or analyses.
+
+    Ensure the notebook is well-organized with clear explanations for each step.
+    The output should be Markdown content with Python code snippets enclosed in "```python" and "```".
+
+    The user will provide the dataset information in the following format:
 
     ## Columns and Data Types
 
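As a reality check on the eleven steps this hunk spells out, they compress to a handful of Pandas/Matplotlib/Seaborn calls. An illustrative sketch with made-up data, not code from this repo:

```python
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Stand-in for the DataFrame the user's loading code would produce.
df = pd.DataFrame({"age": [22, 35, 35, 58], "income": [30, 52, 52, 71]})

df.info()                     # structure and dtypes (steps 3 and 5)
print(df.isnull().sum())      # missing values (step 4)
print(df.duplicated().sum())  # duplicated rows (step 6)
print(df.describe())          # descriptive statistics (step 7)

df.hist(figsize=(8, 4))       # per-column distributions (step 8)
sns.heatmap(df.corr(numeric_only=True), annot=True)  # steps 9-10
plt.show()
```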
@@ -64,30 +66,32 @@ def generate_eda_system_prompt():
 
     ## Loading Data code
 
-
+    Use the provided code to load the dataset; do not use any other method.
     """
 
 
 @outlines.prompt
 def generate_embedding_system_prompt():
-    """You are an expert data scientist tasked with
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    The
+    """You are an expert data scientist tasked with creating a Jupyter notebook to generate embeddings for a specific dataset.
+    Use only the following libraries: 'pandas' for data manipulation, 'sentence-transformers' to load the embedding model, and 'faiss-cpu' to create the index.
+
+    The notebook should include:
+
+    1. Install necessary libraries with !pip install.
+    2. Import libraries.
+    3. Load the dataset as a DataFrame using the provided code.
+    4. Select the column to generate embeddings.
+    5. Remove duplicate data.
+    6. Convert the selected column to a list.
+    7. Load the sentence-transformers model.
+    8. Create a FAISS index.
+    9. Encode a query sample.
+    10. Search for similar documents using the FAISS index.
+
+    Ensure the notebook is well-organized with explanations for each step.
+    The output should be Markdown content with Python code snippets enclosed in "```python" and "```".
+
+    The user will provide dataset information in the following format:
 
     ## Columns and Data Types
 
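The ten steps for the embeddings notebook map directly onto the sentence-transformers and faiss-cpu APIs. A compressed sketch; the data, column, and model name are illustrative, since the prompt leaves those choices to the generated notebook:

```python
import faiss
import pandas as pd
from sentence_transformers import SentenceTransformer

# Stand-ins for the user's dataset and chosen text column (steps 3-6).
df = pd.DataFrame({"text": ["a cat", "a dog", "a car"]}).drop_duplicates()
texts = df["text"].tolist()

model = SentenceTransformer("all-MiniLM-L6-v2")  # step 7, illustrative model
embeddings = model.encode(texts)

index = faiss.IndexFlatL2(embeddings.shape[1])   # step 8
index.add(embeddings)

query = model.encode(["an animal"])              # step 9
distances, ids = index.search(query, 2)          # step 10
print(df.iloc[ids[0]])
```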
@@ -95,36 +99,37 @@ def generate_embedding_system_prompt():
 
     ## Loading Data code
 
-
-
+    Use the provided code to load the dataset; do not use any other method.
     """
 
 
 @outlines.prompt
 def generate_rag_system_prompt():
-    """You are an expert machine learning engineer tasked with
-    The
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    The
+    """You are an expert machine learning engineer tasked with creating a Jupyter notebook to demonstrate a Retrieval-Augmented Generation (RAG) system using a specific dataset.
+    The dataset is provided as a pandas DataFrame.
+
+    Use only the following libraries: 'pandas' for data manipulation, 'sentence-transformers' to load the embedding model, 'faiss-cpu' to create the index, and 'transformers' for inference.
+
+    The RAG notebook should include:
+
+    1. Install necessary libraries.
+    2. Import libraries.
+    3. Load the dataset as a DataFrame using the provided code.
+    4. Select the column for generating embeddings.
+    5. Remove duplicate data.
+    6. Convert the selected column to a list.
+    7. Load the sentence-transformers model.
+    8. Create a FAISS index.
+    9. Encode a query sample.
+    10. Search for similar documents using the FAISS index.
+    11. Load the 'HuggingFaceH4/zephyr-7b-beta' model from the transformers library and create a pipeline.
+    12. Create a prompt with two parts: 'system' for instructions based on a 'context' from the retrieved documents, and 'user' for the query.
+    13. Send the prompt to the pipeline and display the answer.
+
+    Ensure the notebook is well-organized with explanations for each step.
+    The output should be Markdown content with Python code snippets enclosed in "```python" and "```".
+
+    The user will provide the dataset information in the following format:
 
     ## Columns and Data Types
 
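Steps 11-13 correspond to standard transformers usage for 'HuggingFaceH4/zephyr-7b-beta'. A sketch assuming the retrieval half already produced `retrieved_docs` (values below are placeholders; a 7B model additionally needs dtype/device arguments in practice):

```python
from transformers import pipeline

retrieved_docs = ["Doc on DataFrame joins.", "Doc on merge keys."]  # placeholder retrieval output
query = "How do I join two DataFrames?"

pipe = pipeline("text-generation", model="HuggingFaceH4/zephyr-7b-beta")  # step 11

# Step 12: two-part prompt, with the retrieved documents as 'system' context.
messages = [
    {"role": "system", "content": "Answer using this context:\n" + "\n".join(retrieved_docs)},
    {"role": "user", "content": query},
]
prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Step 13: send the prompt to the pipeline and display the answer.
print(pipe(prompt, max_new_tokens=256, return_full_text=False)[0]["generated_text"])
```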
@@ -132,5 +137,5 @@ def generate_rag_system_prompt():
 
     ## Loading Data code
 
-
+    Use the provided code to load the dataset; do not use any other method.
     """