Adding sample datasets
- app.py +84 -68
- utils/prompts.py +11 -9
app.py
CHANGED
@@ -16,6 +16,8 @@ from utils.prompts import (
     generate_eda_system_prompt,
     generate_embedding_system_prompt,
 )
+from dotenv import load_dotenv
+import os
 
 """
 TODOs:
@@ -36,6 +38,17 @@ TODOs:
 """
 
 # Configuration
+
+load_dotenv()
+
+HF_TOKEN = os.getenv("HF_TOKEN")
+NOTEBOOKS_REPOSITORY = os.getenv("NOTEBOOKS_REPOSITORY")
+assert HF_TOKEN is not None, "You need to set HF_TOKEN in your environment variables"
+assert (
+    NOTEBOOKS_REPOSITORY is not None
+), "You need to set NOTEBOOKS_REPOSITORY in your environment variables"
+
+
 BASE_DATASETS_SERVER_URL = "https://datasets-server.huggingface.co"
 HEADERS = {"Accept": "application/json", "Content-Type": "application/json"}
 
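The new configuration block fails fast when either variable is missing. For local runs, a minimal sketch of supplying them through a `.env` file read by `load_dotenv()` (the values are placeholders, not taken from this commit):

```python
import os
from dotenv import load_dotenv

# A placeholder .env next to app.py could contain:
#   HF_TOKEN=hf_xxxxxxxxxxxxxxxxxxxx
#   NOTEBOOKS_REPOSITORY=your-username/generated-notebooks
load_dotenv()  # by default this does not override variables already set in the environment

print(os.getenv("HF_TOKEN") is not None)  # True once the token is configured
```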
@@ -101,7 +114,7 @@ def get_txt_from_output(output):
         return content
     except Exception as e:
         gr.Error("Error when parsing notebook, try again.")
-        logging.error(f"Failed to
+        logging.error(f"Failed to parse code: {e}")
         raise
 
 
@@ -139,74 +152,64 @@ def content_from_output(output):
     return match.group(1)
 
 
-def generate_eda_cells(dataset_id
+def generate_eda_cells(dataset_id):
     for messages in generate_cells(dataset_id, generate_eda_system_prompt, "eda"):
-        yield messages,
+        yield messages, None  # Keep button hidden
 
     yield (
         messages,
-        gr.update(visible=profile and dataset_id.split("/")[0] == profile.username),
         f"{dataset_id.replace('/', '-')}-eda.ipynb",
     )
 
 
-def generate_rag_cells(dataset_id
+def generate_rag_cells(dataset_id):
     for messages in generate_cells(dataset_id, generate_rag_system_prompt, "rag"):
-        yield messages,
+        yield messages, None  # Keep button hidden
 
     yield (
         messages,
-        gr.update(visible=profile and dataset_id.split("/")[0] == profile.username),
         f"{dataset_id.replace('/', '-')}-rag.ipynb",
     )
 
 
-def generate_embedding_cells(dataset_id
+def generate_embedding_cells(dataset_id):
     for messages in generate_cells(
         dataset_id, generate_embedding_system_prompt, "embedding"
     ):
-        yield messages,
+        yield messages, None  # Keep button hidden
 
     yield (
         messages,
-        gr.update(visible=profile and dataset_id.split("/")[0] == profile.username),
         f"{dataset_id.replace('/', '-')}-embedding.ipynb",
     )
 
 
-def push_to_hub(
+def _push_to_hub(
     history,
     dataset_id,
     notebook_file,
-    profile: gr.OAuthProfile | None,
-    oauth_token: gr.OAuthToken | None,
 ):
     logging.info(f"Pushing notebook to hub: {dataset_id} on file {notebook_file}")
-    if not profile or not oauth_token:
-        yield history + [
-            gr.ChatMessage(role="assistant", content="⏳ _Login to push to hub..._")
-        ]
-        return
-    logging.info(f"Profile: {profile}, token: {oauth_token.token}")
 
-    notebook_name =
-    api = HfApi(token=
+    notebook_name = notebook_file.split("/")[-1]
+    api = HfApi(token=HF_TOKEN)
     try:
-        logging.info(f"About to push {notebook_file} - {
+        logging.info(f"About to push {notebook_file} - {dataset_id}")
         api.upload_file(
             path_or_fileobj=notebook_file,
             path_in_repo=notebook_name,
-            repo_id=
+            repo_id=NOTEBOOKS_REPOSITORY,
             repo_type="dataset",
         )
-        link = f"https://huggingface.co/datasets/{
+        link = f"https://huggingface.co/datasets/{NOTEBOOKS_REPOSITORY}/blob/main/{notebook_name}"
         logging.info(f"Notebook pushed to hub: {link}")
         yield history + [
             gr.ChatMessage(
                 role="user",
-                content=f"[
+                content=f"[{notebook_name}]({link})",
             )
         ]
+
     except Exception as e:
         logging.info("Failed to push notebook", e)
         yield history + [gr.ChatMessage(role="assistant", content=e)]
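With login removed, the upload now authenticates with the Space-level token and always targets the shared notebooks repository. A hedged sketch of the `huggingface_hub` call being made, with placeholders standing in for the environment values:

```python
from huggingface_hub import HfApi

api = HfApi(token="hf_xxx")  # placeholder for HF_TOKEN
api.upload_file(
    path_or_fileobj="jamescalam-world-cities-geo-eda.ipynb",  # local notebook produced by create_notebook_file
    path_in_repo="jamescalam-world-cities-geo-eda.ipynb",     # stored at the repository root
    repo_id="your-username/generated-notebooks",              # placeholder for NOTEBOOKS_REPOSITORY
    repo_type="dataset",
)
# The chat message then links to:
# https://huggingface.co/datasets/your-username/generated-notebooks/blob/main/jamescalam-world-cities-geo-eda.ipynb
```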
@@ -292,31 +295,50 @@ def generate_cells(dataset_id, prompt_fn, notebook_type="eda"):
     notebook_name = f"{dataset_id.replace('/', '-')}-{notebook_type}.ipynb"
     create_notebook_file(commands, notebook_name=notebook_name)
     messages.append(
-        gr.ChatMessage(role="user", content="
-    )
-    yield messages
-    messages.append(
-        gr.ChatMessage(
-            role="user",
-            content=FileData(path=notebook_name, mime_type="application/x-ipynb+json"),
-        )
+        gr.ChatMessage(role="user", content="See the generated notebook on the Hub")
     )
     yield messages
+    yield from _push_to_hub(messages, dataset_id, notebook_name)
 
 
 def coming_soon_message():
     return gr.Info("Coming soon")
 
 
+def handle_example(example, button_action):
+    return button_action(example)
+
+
-with gr.Blocks(fill_height=True) as demo:
+with gr.Blocks(fill_width=True) as demo:
     gr.Markdown("# 🤖 Dataset notebook creator 🕵️")
-    with gr.Row():
-        with gr.Column(scale=
+    with gr.Row(equal_height=True):
+        with gr.Column(scale=2):
+            text_input = gr.Textbox(label="Suggested notebook type", visible=False)
+
             dataset_name = HuggingfaceHubSearch(
                 label="Hub Dataset ID",
                 placeholder="Search for dataset id on Huggingface",
                 search_type="dataset",
-                value="",
+                value="jamescalam/world-cities-geo",
+            )
+
+            dataset_samples = gr.Examples(
+                examples=[
+                    [
+                        "infinite-dataset-hub/WorldPopCounts",
+                        "Try this dataset for Exploratory Data Analysis",
+                    ],
+                    [
+                        "infinite-dataset-hub/GlobaleCuisineRecipes",
+                        "Try this dataset for Embeddings generation",
+                    ],
+                    [
+                        "infinite-dataset-hub/GlobalBestSellersSummaries",
+                        "Try this dataset for RAG generation",
+                    ],
+                ],
+                inputs=[dataset_name, text_input],
+                cache_examples=False,
             )
 
     @gr.render(inputs=dataset_name)
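Rows in `gr.Examples` map positionally onto `inputs`, so selecting a sample fills both the dataset search box and the hidden hint textbox. A standalone sketch of that wiring, with a plain `gr.Textbox` standing in for `HuggingfaceHubSearch`:

```python
import gradio as gr

with gr.Blocks() as sketch:
    dataset_box = gr.Textbox(label="Hub Dataset ID")  # stand-in for HuggingfaceHubSearch
    hint_box = gr.Textbox(label="Suggested notebook type", visible=False)
    gr.Examples(
        examples=[
            ["infinite-dataset-hub/WorldPopCounts", "Try this dataset for Exploratory Data Analysis"],
        ],
        inputs=[dataset_box, hint_box],  # each example row fills these components in order
        cache_examples=False,
    )
```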
@@ -334,50 +356,44 @@ with gr.Blocks(fill_height=True) as demo:
         return gr.HTML(value=html_code)
 
         with gr.Row():
-            generate_eda_btn = gr.Button("
-            generate_embedding_btn = gr.Button("
-            generate_rag_btn = gr.Button("
-            generate_training_btn = gr.Button(
-
-
-
-            type="messages",
-            avatar_images=(
-                None,
-                None,
-            ),
-        )
+            generate_eda_btn = gr.Button("Exploratory Data Analysis")
+            generate_embedding_btn = gr.Button("Data Embeddings")
+            generate_rag_btn = gr.Button("RAG")
+            generate_training_btn = gr.Button(
+                "Training - Coming soon", interactive=False
+            )
+        with gr.Column(scale=1):
             with gr.Row():
-
-
+                chatbot = gr.Chatbot(
+                    label="Results",
+                    type="messages",
+                    height=650,
+                    avatar_images=(
+                        None,
+                        None,
+                    ),
+                )
+
                 notebook_file = gr.File(visible=False)
                 generate_eda_btn.click(
                     generate_eda_cells,
                     inputs=[dataset_name],
-                    outputs=[chatbot,
+                    outputs=[chatbot, notebook_file],
                 )
 
-
-
+                generate_embedding_btn.click(
+                    generate_embedding_cells,
                     inputs=[dataset_name],
-                    outputs=[chatbot,
+                    outputs=[chatbot, notebook_file],
                 )
 
-
-
+                generate_rag_btn.click(
+                    generate_rag_cells,
                     inputs=[dataset_name],
-                    outputs=[chatbot,
+                    outputs=[chatbot, notebook_file],
                 )
 
                 generate_training_btn.click(coming_soon_message, inputs=[], outputs=[])
-
-                    push_to_hub,
-                    inputs=[
-                        chatbot,
-                        dataset_name,
-                        notebook_file,
-                    ],
-                    outputs=[chatbot],
-                )
+
 
 demo.launch()
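Each button now streams from a generator whose yields are `(messages, filename)` pairs routed to `[chatbot, notebook_file]`. A hypothetical driver loop (not part of the app) showing that contract:

```python
# Intermediate yields keep the file slot at None; the final yield carries the
# generated notebook's filename for the hidden gr.File component.
for messages, notebook_name in generate_eda_cells("jamescalam/world-cities-geo"):
    if notebook_name is not None:
        print(f"Notebook ready: {notebook_name}")
```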
utils/prompts.py
CHANGED
@@ -6,8 +6,9 @@ def generate_mapping_prompt(code):
     """Format the following python code to a list of cells to be used in a jupyter notebook:
     {{ code }}
 
-
-
+    ## Instruction
+    Before returning the result, evaluate if the json object is well formatted, if not, fix it.
+    The output should be a list of json objects with the following schema, including the leading and trailing "```json" and "```":
 
     ```json
     [
@@ -42,7 +43,7 @@ def generate_eda_system_prompt():
     You create Exploratory Data Analysis jupyter notebooks with the following content:
 
     1. Install an import libraries
-    2. Load the
+    2. Load dataset as dataframe using the provided loading data code snippet
     3. Understand the dataset
     4. Check for missing values
     5. Identify the data types of each column
@@ -70,12 +71,12 @@ def generate_eda_system_prompt():
 @outlines.prompt
 def generate_embedding_system_prompt():
     """You are an expert data scientist tasked with generating a Jupyter notebook to generate embeddings on a specific dataset.
-    You
+    You must use only the following libraries: 'pandas' for data manipulation, 'sentence-transformers' to load the embedding model and 'faiss-cpu' to create the index.
     You create a jupyter notebooks with the following content:
 
-    1. Install libraries
+    1. Install libraries as !pip install
     2. Import libraries
-    3. Load dataset as dataframe
+    3. Load dataset as dataframe using the provided loading data code snippet
     4. Choose column to be used for the embeddings
     5. Remove duplicate data
     6. Load column as a list
@@ -103,12 +104,13 @@ def generate_embedding_system_prompt():
 def generate_rag_system_prompt():
     """You are an expert machine learning engineer tasked with generating a Jupyter notebook to showcase a Retrieval-Augmented Generation (RAG) system based on a specific dataset.
     The data is provided as a pandas DataFrame with the following structure:
+    You can use only the following libraries: 'pandas' for data manipulation, 'sentence-transformers' to load the embedding model, 'faiss-cpu' to create the index and 'transformers' for inference.
 
     You create Exploratory RAG jupyter notebooks with the following content:
 
     1. Install libraries
     2. Import libraries
-    3. Load dataset as dataframe
+    3. Load dataset as dataframe using the provided loading data code snippet
     4. Choose column to be used for the embeddings
     5. Remove duplicate data
     6. Load column as a list
@@ -116,8 +118,8 @@ def generate_rag_system_prompt():
     8. Create FAISS index
     9. Ask a query sample and encode it
     10. Search similar documents based on the query sample and the FAISS index
-    11. Load HuggingFaceH4/zephyr-7b-beta model from transformers library and create a pipeline
-    12. Create a prompt with two parts: 'system' to give instructions to answer a question based on a 'context' that is the retrieved similar
+    11. Load 'HuggingFaceH4/zephyr-7b-beta model' from transformers library and create a pipeline
+    12. Create a prompt with two parts: 'system' to give instructions to answer a question based on a 'context' that is the retrieved similar documents and a 'user' part with the query
     13. Send the prompt to the pipeline and show answer
 
     Ensure the notebook is well-organized, with explanations for each step.
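These prompt functions rely on `@outlines.prompt`, which (in the outlines versions this app appears to target) treats the docstring as a Jinja2 template. A small sketch of how `{{ code }}` is filled in, assuming that decorator behavior:

```python
import outlines

@outlines.prompt
def generate_mapping_prompt(code):
    """Format the following python code to a list of cells to be used in a jupyter notebook:
    {{ code }}
    """

# Calling the decorated function renders the template rather than executing a body.
prompt = generate_mapping_prompt("print('hello')")
print(prompt)  # the docstring with {{ code }} replaced by the snippet
```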