asoria (HF Staff) committed
Commit ae0bcb8 · Parent: d51b1c2

Adding sample datasets

Files changed (2):
  1. app.py +84 -68
  2. utils/prompts.py +11 -9
app.py CHANGED
```diff
@@ -16,6 +16,8 @@ from utils.prompts import (
     generate_eda_system_prompt,
     generate_embedding_system_prompt,
 )
+from dotenv import load_dotenv
+import os
 
 """
 TODOs:
```
```diff
@@ -36,6 +38,17 @@ TODOs:
 """
 
 # Configuration
+
+load_dotenv()
+
+HF_TOKEN = os.getenv("HF_TOKEN")
+NOTEBOOKS_REPOSITORY = os.getenv("NOTEBOOKS_REPOSITORY")
+assert HF_TOKEN is not None, "You need to set HF_TOKEN in your environment variables"
+assert (
+    NOTEBOOKS_REPOSITORY is not None
+), "You need to set NOTEBOOKS_REPOSITORY in your environment variables"
+
+
 BASE_DATASETS_SERVER_URL = "https://datasets-server.huggingface.co"
 HEADERS = {"Accept": "application/json", "Content-Type": "application/json"}
 
```
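For anyone running the Space locally after this change, both variables must exist before the asserts run. A minimal sketch of the expected setup, with placeholder values that are not part of this commit:

```python
# Hypothetical .env file sitting next to app.py (placeholder values):
#
#   HF_TOKEN=hf_...                            # a Hub token with write access
#   NOTEBOOKS_REPOSITORY=some-user/notebooks   # a dataset repo that token can write to
#
# python-dotenv copies these key=value pairs into os.environ; if the file is
# missing, load_dotenv() is a no-op and the asserts above fail fast.
import os

from dotenv import load_dotenv

load_dotenv()
print(os.getenv("HF_TOKEN") is not None, os.getenv("NOTEBOOKS_REPOSITORY"))
```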
```diff
@@ -101,7 +114,7 @@ def get_txt_from_output(output):
         return content
     except Exception as e:
         gr.Error("Error when parsing notebook, try again.")
-        logging.error(f"Failed to fetch compatible libraries: {e}")
+        logging.error(f"Failed to parse code: {e}")
         raise
 
 
```
```diff
@@ -139,74 +152,64 @@ def content_from_output(output):
     return match.group(1)
 
 
-def generate_eda_cells(dataset_id, profile: gr.OAuthProfile | None):
+def generate_eda_cells(dataset_id):
     for messages in generate_cells(dataset_id, generate_eda_system_prompt, "eda"):
-        yield messages, gr.update(visible=False), None  # Keep button hidden
+        yield messages, None  # Keep button hidden
 
     yield (
         messages,
-        gr.update(visible=profile and dataset_id.split("/")[0] == profile.username),
         f"{dataset_id.replace('/', '-')}-eda.ipynb",
     )
 
 
-def generate_rag_cells(dataset_id, profile: gr.OAuthProfile | None):
+def generate_rag_cells(dataset_id):
     for messages in generate_cells(dataset_id, generate_rag_system_prompt, "rag"):
-        yield messages, gr.update(visible=False), None  # Keep button hidden
+        yield messages, None  # Keep button hidden
 
     yield (
         messages,
-        gr.update(visible=profile and dataset_id.split("/")[0] == profile.username),
         f"{dataset_id.replace('/', '-')}-rag.ipynb",
     )
 
 
-def generate_embedding_cells(dataset_id, profile: gr.OAuthProfile | None):
+def generate_embedding_cells(dataset_id):
     for messages in generate_cells(
         dataset_id, generate_embedding_system_prompt, "embedding"
     ):
-        yield messages, gr.update(visible=False), None  # Keep button hidden
+        yield messages, None  # Keep button hidden
 
     yield (
         messages,
-        gr.update(visible=profile and dataset_id.split("/")[0] == profile.username),
         f"{dataset_id.replace('/', '-')}-embedding.ipynb",
     )
 
 
-def push_to_hub(
+def _push_to_hub(
     history,
     dataset_id,
     notebook_file,
-    profile: gr.OAuthProfile | None,
-    oauth_token: gr.OAuthToken | None,
 ):
     logging.info(f"Pushing notebook to hub: {dataset_id} on file {notebook_file}")
-    if not profile or not oauth_token:
-        yield history + [
-            gr.ChatMessage(role="assistant", content="⏳ _Login to push to hub..._")
-        ]
-        return
-    logging.info(f"Profile: {profile}, token: {oauth_token.token}")
 
-    notebook_name = "dataset_analysis.ipynb"
-    api = HfApi(token=oauth_token.token)
+    notebook_name = notebook_file.split("/")[-1]
+    api = HfApi(token=HF_TOKEN)
     try:
-        logging.info(f"About to push {notebook_file} - {notebook_name} - {dataset_id}")
+        logging.info(f"About to push {notebook_file} - {dataset_id}")
         api.upload_file(
             path_or_fileobj=notebook_file,
             path_in_repo=notebook_name,
-            repo_id=dataset_id,
+            repo_id=NOTEBOOKS_REPOSITORY,
             repo_type="dataset",
         )
-        link = f"https://huggingface.co/datasets/{dataset_id}/blob/main/{notebook_name}"
+        link = f"https://huggingface.co/datasets/{NOTEBOOKS_REPOSITORY}/blob/main/{notebook_name}"
         logging.info(f"Notebook pushed to hub: {link}")
         yield history + [
             gr.ChatMessage(
                 role="user",
-                content=f"[See the notebook on the Hub]({link})",
+                content=f"[{notebook_name}]({link})",
             )
         ]
+
     except Exception as e:
         logging.info("Failed to push notebook", e)
         yield history + [gr.ChatMessage(role="assistant", content=e)]
```
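The net effect of this hunk: notebooks are no longer pushed to the visitor's own dataset repo with their OAuth token; they always go to the central `NOTEBOOKS_REPOSITORY` with the Space's token, keeping the original file name. A standalone sketch of the new upload path, with placeholder token and repo id:

```python
# Placeholder values; the app reads HF_TOKEN and NOTEBOOKS_REPOSITORY from .env.
from huggingface_hub import HfApi

api = HfApi(token="hf_...")
api.upload_file(
    path_or_fileobj="jamescalam-world-cities-geo-eda.ipynb",  # local notebook path
    path_in_repo="jamescalam-world-cities-geo-eda.ipynb",     # basename is kept
    repo_id="some-user/notebooks",  # stands in for NOTEBOOKS_REPOSITORY
    repo_type="dataset",
)
# Resulting link format:
# https://huggingface.co/datasets/some-user/notebooks/blob/main/<notebook_name>
```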
```diff
@@ -292,31 +295,50 @@ def generate_cells(dataset_id, prompt_fn, notebook_type="eda"):
     notebook_name = f"{dataset_id.replace('/', '-')}-{notebook_type}.ipynb"
     create_notebook_file(commands, notebook_name=notebook_name)
     messages.append(
-        gr.ChatMessage(role="user", content="Here is the generated notebook file")
-    )
-    yield messages
-    messages.append(
-        gr.ChatMessage(
-            role="user",
-            content=FileData(path=notebook_name, mime_type="application/x-ipynb+json"),
-        )
+        gr.ChatMessage(role="user", content="See the generated notebook on the Hub")
     )
     yield messages
+    yield from _push_to_hub(messages, dataset_id, notebook_name)
 
 
 def coming_soon_message():
     return gr.Info("Coming soon")
 
 
-with gr.Blocks(fill_height=True) as demo:
+def handle_example(example, button_action):
+    return button_action(example)
+
+
+with gr.Blocks(fill_width=True) as demo:
     gr.Markdown("# 🤖 Dataset notebook creator 🕵️")
-    with gr.Row():
-        with gr.Column(scale=1):
+    with gr.Row(equal_height=True):
+        with gr.Column(scale=2):
+            text_input = gr.Textbox(label="Suggested notebook type", visible=False)
+
             dataset_name = HuggingfaceHubSearch(
                 label="Hub Dataset ID",
                 placeholder="Search for dataset id on Huggingface",
                 search_type="dataset",
-                value="",
+                value="jamescalam/world-cities-geo",
+            )
+
+            dataset_samples = gr.Examples(
+                examples=[
+                    [
+                        "infinite-dataset-hub/WorldPopCounts",
+                        "Try this dataset for Exploratory Data Analysis",
+                    ],
+                    [
+                        "infinite-dataset-hub/GlobaleCuisineRecipes",
+                        "Try this dataset for Embeddings generation",
+                    ],
+                    [
+                        "infinite-dataset-hub/GlobalBestSellersSummaries",
+                        "Try this dataset for RAG generation",
+                    ],
+                ],
+                inputs=[dataset_name, text_input],
+                cache_examples=False,
             )
 
             @gr.render(inputs=dataset_name)
```
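Each example row pairs a sample dataset id with a suggestion string, so a click fills both the search box and the hidden `text_input`. A minimal self-contained sketch of the same `gr.Examples` pattern, with simplified component names:

```python
import gradio as gr

with gr.Blocks() as demo:
    name = gr.Textbox(label="Dataset ID")
    hint = gr.Textbox(label="Suggested notebook type", visible=False)
    gr.Examples(
        examples=[
            [
                "infinite-dataset-hub/WorldPopCounts",
                "Try this dataset for Exploratory Data Analysis",
            ],
        ],
        inputs=[name, hint],   # clicking a row fills both components
        cache_examples=False,  # no outputs are precomputed
    )

demo.launch()
```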
```diff
@@ -334,50 +356,44 @@ with gr.Blocks(fill_height=True) as demo:
                 return gr.HTML(value=html_code)
 
             with gr.Row():
-                generate_eda_btn = gr.Button("Generate EDA notebook")
-                generate_embedding_btn = gr.Button("Generate Embeddings notebook")
-                generate_rag_btn = gr.Button("Generate RAG notebook")
-                generate_training_btn = gr.Button("Generate Training notebook")
-        with gr.Column():
-            chatbot = gr.Chatbot(
-                label="Results",
-                type="messages",
-                avatar_images=(
-                    None,
-                    None,
-                ),
-            )
+                generate_eda_btn = gr.Button("Exploratory Data Analysis")
+                generate_embedding_btn = gr.Button("Data Embeddings")
+                generate_rag_btn = gr.Button("RAG")
+                generate_training_btn = gr.Button(
+                    "Training - Coming soon", interactive=False
+                )
+        with gr.Column(scale=1):
             with gr.Row():
-                login_btn = gr.LoginButton()
-                push_btn = gr.Button("Push to hub", visible=False)
+                chatbot = gr.Chatbot(
+                    label="Results",
+                    type="messages",
+                    height=650,
+                    avatar_images=(
+                        None,
+                        None,
+                    ),
+                )
+
     notebook_file = gr.File(visible=False)
     generate_eda_btn.click(
         generate_eda_cells,
         inputs=[dataset_name],
-        outputs=[chatbot, push_btn, notebook_file],
+        outputs=[chatbot, notebook_file],
     )
 
-    generate_rag_btn.click(
-        generate_rag_cells,
+    generate_embedding_btn.click(
+        generate_embedding_cells,
         inputs=[dataset_name],
-        outputs=[chatbot, push_btn, notebook_file],
+        outputs=[chatbot, notebook_file],
     )
 
-    generate_embedding_btn.click(
-        generate_embedding_cells,
+    generate_rag_btn.click(
+        generate_rag_cells,
         inputs=[dataset_name],
-        outputs=[chatbot, push_btn, notebook_file],
+        outputs=[chatbot, notebook_file],
     )
 
     generate_training_btn.click(coming_soon_message, inputs=[], outputs=[])
-    push_btn.click(
-        push_to_hub,
-        inputs=[
-            chatbot,
-            dataset_name,
-            notebook_file,
-        ],
-        outputs=[chatbot],
-    )
+
 
 demo.launch()
```
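With `push_btn` removed, every click handler now streams to exactly two outputs. A sketch of the generator contract this wiring assumes; the real functions are defined earlier in app.py, and this stand-in is purely illustrative:

```python
# Each yield must supply one value per output component: (chatbot, notebook_file).
def fake_generate_cells(dataset_id):
    messages = []
    for step in ("Fetching sample data", "Prompting the model"):
        messages = messages + [{"role": "assistant", "content": step}]
        yield messages, None  # update the chat; no file yet
    # The final yield also hands the notebook path to the hidden gr.File.
    yield messages, f"{dataset_id.replace('/', '-')}-eda.ipynb"
```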
 
utils/prompts.py CHANGED
```diff
@@ -6,8 +6,9 @@ def generate_mapping_prompt(code):
     """Format the following python code to a list of cells to be used in a jupyter notebook:
     {{ code }}
 
-    The output should be a list of json objects with the
-    following schema, including the leading and trailing "```json" and "```":
+    ## Instruction
+    Before returning the result, evaluate if the json object is well formatted, if not, fix it.
+    The output should be a list of json objects with the following schema, including the leading and trailing "```json" and "```":
 
     ```json
     [
```
```diff
@@ -42,7 +43,7 @@ def generate_eda_system_prompt():
     You create Exploratory Data Analysis jupyter notebooks with the following content:
 
     1. Install an import libraries
-    2. Load the dataset
+    2. Load dataset as dataframe using the provided loading data code snippet
     3. Understand the dataset
     4. Check for missing values
     5. Identify the data types of each column
```
```diff
@@ -70,12 +71,12 @@ def generate_eda_system_prompt():
 @outlines.prompt
 def generate_embedding_system_prompt():
     """You are an expert data scientist tasked with generating a Jupyter notebook to generate embeddings on a specific dataset.
-    You can use only the following libraries: Pandas for data manipulation, sentence-transformers to load the embedding model and FAISS to create the index.
+    You must use only the following libraries: 'pandas' for data manipulation, 'sentence-transformers' to load the embedding model and 'faiss-cpu' to create the index.
     You create a jupyter notebooks with the following content:
 
-    1. Install libraries
+    1. Install libraries as !pip install
     2. Import libraries
-    3. Load dataset as dataframe
+    3. Load dataset as dataframe using the provided loading data code snippet
     4. Choose column to be used for the embeddings
     5. Remove duplicate data
     6. Load column as a list
```
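For reference, the pipeline this reworded prompt constrains the model to emit is roughly the following; the embedding model and the toy data are illustrative, not part of the commit:

```python
import pandas as pd
import faiss  # installed via the faiss-cpu package
from sentence_transformers import SentenceTransformer

# Steps 3-6: load a dataframe, pick a column, drop duplicates, get a list.
df = pd.DataFrame({"text": ["hello world", "hola mundo", "hello world"]})
texts = df["text"].drop_duplicates().tolist()

# Steps 7-8: encode the texts and build the FAISS index.
model = SentenceTransformer("all-MiniLM-L6-v2")  # placeholder model
embeddings = model.encode(texts)                 # float32 numpy array
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)
```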
```diff
@@ -103,12 +104,13 @@ def generate_embedding_system_prompt():
 def generate_rag_system_prompt():
     """You are an expert machine learning engineer tasked with generating a Jupyter notebook to showcase a Retrieval-Augmented Generation (RAG) system based on a specific dataset.
     The data is provided as a pandas DataFrame with the following structure:
+    You can use only the following libraries: 'pandas' for data manipulation, 'sentence-transformers' to load the embedding model, 'faiss-cpu' to create the index and 'transformers' for inference.
 
     You create Exploratory RAG jupyter notebooks with the following content:
 
     1. Install libraries
     2. Import libraries
-    3. Load dataset as dataframe
+    3. Load dataset as dataframe using the provided loading data code snippet
     4. Choose column to be used for the embeddings
     5. Remove duplicate data
     6. Load column as a list
```
```diff
@@ -116,8 +118,8 @@ def generate_rag_system_prompt():
     8. Create FAISS index
     9. Ask a query sample and encode it
     10. Search similar documents based on the query sample and the FAISS index
-    11. Load HuggingFaceH4/zephyr-7b-beta model from transformers library and create a pipeline
-    12. Create a prompt with two parts: 'system' to give instructions to answer a question based on a 'context' that is the retrieved similar docuemnts and a 'user' part with the query
+    11. Load 'HuggingFaceH4/zephyr-7b-beta model' from transformers library and create a pipeline
+    12. Create a prompt with two parts: 'system' to give instructions to answer a question based on a 'context' that is the retrieved similar documents and a 'user' part with the query
     13. Send the prompt to the pipeline and show answer
 
     Ensure the notebook is well-organized, with explanations for each step.
```
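Steps 11 and 12, as reworded here, amount to roughly the following `transformers` usage; the context and query strings are illustrative, and a GPU is realistically required for a 7B model:

```python
from transformers import pipeline

pipe = pipeline("text-generation", model="HuggingFaceH4/zephyr-7b-beta")
messages = [
    # 'system' carries instructions plus the retrieved documents as context.
    {"role": "system", "content": "Answer using this context: <retrieved documents>"},
    # 'user' carries the query itself.
    {"role": "user", "content": "What is this dataset about?"},
]
prompt = pipe.tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
print(pipe(prompt, max_new_tokens=128)[0]["generated_text"])
```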
 