lhoestq HF staff commited on
Commit
06adbe1
·
1 Parent(s): df06525

add save dataset

Browse files
Files changed (1) hide show
  1. app.py +96 -52
app.py CHANGED
@@ -1,4 +1,5 @@
1
  import io
 
2
  import re
3
  import time
4
  from itertools import islice
@@ -10,16 +11,18 @@ from typing import Callable, Iterable, Iterator, Optional, TypeVar
10
  import gradio as gr
11
  import pandas as pd
12
  import requests.exceptions
13
- from huggingface_hub import InferenceClient
14
 
15
 
16
  model_id = "microsoft/Phi-3-mini-4k-instruct"
17
  client = InferenceClient(model_id)
 
18
 
19
  MAX_TOTAL_NB_ITEMS = 100 # almost infinite, don't judge me (actually it's because gradio needs a fixed number of components)
20
  MAX_NB_ITEMS_PER_GENERATION_CALL = 10
21
  NUM_ROWS = 100
22
  NUM_VARIANTS = 10
 
23
  URL = "https://huggingface.co/spaces/infinite-dataset-hub/infinite-dataset-hub"
24
 
25
  GENERATE_DATASET_NAMES_FOR_SEARCH_QUERY = (
@@ -54,8 +57,6 @@ LONG_RARITIES = [
54
  "very nice but still plausible",
55
  ]
56
 
57
- landing_page_query = "various datasets on many different subjects and topics, from classification to language modeling, from science to sport to finance to news"
58
-
59
  landing_page_datasets_generated_text = """
60
  1. NewsEventsPredict (classification, media, trend)
61
  2. FinancialForecast (economy, stocks, regression)
@@ -71,6 +72,29 @@ landing_page_datasets_generated_text = """
71
  default_output = landing_page_datasets_generated_text.strip().split("\n")
72
  assert len(default_output) == MAX_NB_ITEMS_PER_GENERATION_CALL
73
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  css = """
75
  a {
76
  color: var(--body-text-color);
@@ -145,35 +169,27 @@ a {
145
  color: transparent;
146
  background-clip: text;
147
  }
 
 
 
 
 
 
148
  """
149
 
150
 
151
  with gr.Blocks(css=css) as demo:
152
  generated_texts_state = gr.State((landing_page_datasets_generated_text,))
153
- with gr.Row():
154
- with gr.Column(scale=4, min_width=0):
155
- pass
156
- with gr.Column(scale=10):
157
- gr.Markdown(
158
- "# 🤗 Infinite Dataset Hub ♾️\n\n"
159
- "An endless catalog of datasets, created just for you.\n\n"
160
- )
161
- with gr.Column(scale=4, min_width=0):
162
- pass
163
  with gr.Column() as search_page:
164
  with gr.Row():
165
- with gr.Column(scale=4, min_width=0):
166
- pass
167
  with gr.Column(scale=10):
 
 
 
 
168
  with gr.Row():
169
  search_bar = gr.Textbox(max_lines=1, placeholder="Search datasets, get infinite results", show_label=False, container=False, scale=9)
170
  search_button = gr.Button("🔍", variant="primary", scale=1)
171
- with gr.Column(scale=4, min_width=0):
172
- pass
173
- with gr.Row():
174
- with gr.Column(scale=4, min_width=0):
175
- pass
176
- with gr.Column(scale=10):
177
  button_groups: list[gr.Group] = []
178
  buttons: list[gr.Button] = []
179
  for i in range(MAX_TOTAL_NB_ITEMS):
@@ -195,20 +211,28 @@ with gr.Blocks(css=css) as demo:
195
 
196
  load_more_datasets = gr.Button("Load more datasets") # TODO: disable when reaching end of page
197
  gr.Markdown(f"_powered by [{model_id}](https://huggingface.co/{model_id})_")
198
- with gr.Column(scale=4, min_width=0):
199
- pass
 
 
 
 
 
200
  with gr.Column(visible=False) as dataset_page:
 
 
 
 
201
  dataset_title = gr.Markdown()
202
  gr.Markdown("_Note: This is an AI-generated dataset so its content may be inaccurate or false_")
203
  dataset_content = gr.Markdown()
204
  generate_full_dataset_button = gr.Button("Generate Full Dataset", variant="primary")
205
  dataset_dataframe = gr.DataFrame(visible=False, interactive=False, wrap=True)
206
  save_dataset_button = gr.Button("💾 Save Dataset", variant="primary", visible=False)
 
207
  dataset_share_button = gr.Button("Share Dataset URL")
208
  dataset_share_textbox = gr.Textbox(visible=False, show_copy_button=True, label="Copy this URL:", interactive=False, show_label=True)
209
  back_button = gr.Button("< Back", size="sm")
210
-
211
- app_state = gr.State({})
212
 
213
  ###################################
214
  #
@@ -254,7 +278,7 @@ with gr.Blocks(css=css) as demo:
254
 
255
  def gen_datasets_line_by_line(search_query: str, generated_texts: tuple[str] = ()) -> Iterator[str]:
256
  search_query = search_query or ""
257
- search_query = search_query[:1000] if search_query.strip() else landing_page_query
258
  generated_text = ""
259
  current_line = ""
260
  for token in stream_reponse(
@@ -273,7 +297,7 @@ with gr.Blocks(css=css) as demo:
273
 
274
  def gen_dataset_content(search_query: str, dataset_name: str, tags: str) -> Iterator[str]:
275
  search_query = search_query or ""
276
- search_query = search_query[:1000] if search_query.strip() else landing_page_query
277
  generated_text = ""
278
  for token in stream_reponse(GENERATE_DATASET_CONTENT_FOR_SEARCH_QUERY_AND_NAME_AND_TAGS.format(
279
  search_query=search_query,
@@ -418,7 +442,7 @@ with gr.Blocks(css=css) as demo:
418
 
419
 
420
  def _search_datasets(search_query):
421
- yield {generated_texts_state: [], app_state: {"search_query": search_query}}
422
  yield {
423
  button_group: gr.Group(elem_classes="buttonsGroup insivibleButtonGroup")
424
  for button_group in button_groups[MAX_NB_ITEMS_PER_GENERATION_CALL:]
@@ -453,12 +477,12 @@ with gr.Blocks(css=css) as demo:
453
  current_item_idx += 1
454
 
455
 
456
- @search_button.click(inputs=search_bar, outputs=button_groups + buttons + [generated_texts_state, app_state])
457
  def search_dataset_from_search_button(search_query):
458
  yield from _search_datasets(search_query)
459
 
460
 
461
- @search_bar.submit(inputs=search_bar, outputs=button_groups + buttons + [generated_texts_state, app_state])
462
  def search_dataset_from_search_bar(search_query):
463
  yield from _search_datasets(search_query)
464
 
@@ -497,20 +521,16 @@ with gr.Blocks(css=css) as demo:
497
  dataset_title: f"# {dataset_name}\n\n tags: {tags}",
498
  dataset_share_textbox: gr.Textbox(visible=False),
499
  dataset_dataframe: gr.DataFrame(visible=False),
500
- generate_full_dataset_button: gr.Button(visible=True),
501
  save_dataset_button: gr.Button(visible=False),
502
- app_state: {
503
- "search_query": search_query,
504
- "dataset_name": dataset_name,
505
- "tags": tags
506
- }
507
  }
508
  for generated_text in gen_dataset_content(search_query=search_query, dataset_name=dataset_name, tags=tags):
509
  yield {dataset_content: generated_text}
510
 
511
 
512
  show_dataset_inputs = [search_bar, *buttons]
513
- show_dataset_outputs = [app_state, search_page, dataset_page, dataset_title, dataset_content, generate_full_dataset_button, dataset_dataframe, save_dataset_button, dataset_share_textbox]
514
  scroll_to_top_js = """
515
  function (...args) {
516
  console.log(args);
@@ -537,8 +557,8 @@ with gr.Blocks(css=css) as demo:
537
  return gr.Column(visible=True), gr.Column(visible=False)
538
 
539
 
540
- @generate_full_dataset_button.click(inputs=[dataset_title, dataset_content, search_bar], outputs=[dataset_dataframe, generate_full_dataset_button, save_dataset_button])
541
- def generate_full_dataset(title, content, search_query):
542
  dataset_name, tags = title.strip("# ").split("\ntags:", 1)
543
  dataset_name, tags = dataset_name.strip(), tags.strip()
544
  csv_header, preview_df = parse_preview_df(content)
@@ -556,8 +576,8 @@ with gr.Blocks(css=css) as demo:
556
  output[:len(preview_df)] = [{"idx": i, **x} for i, x in enumerate(preview_df.to_dict(orient="records"))]
557
  yield {
558
  dataset_dataframe: gr.DataFrame(pd.DataFrame([{"idx": i, **x} for i, x in enumerate(output) if x]), visible=True),
559
- generate_full_dataset_button: gr.Button(visible=False),
560
- save_dataset_button: gr.Button(visible=True, interactive=False)
561
  }
562
  kwargs_iterable = [
563
  {
@@ -573,24 +593,48 @@ with gr.Blocks(css=css) as demo:
573
  ]
574
  for _ in iflatmap_unordered(generate_partial_dataset, kwargs_iterable=kwargs_iterable):
575
  yield {dataset_dataframe: pd.DataFrame([{"idx": i, **{column_name: x.get(column_name) for column_name in columns}} for i, x in enumerate(output) if x])}
576
- yield {save_dataset_button: gr.Button(visible=True, interactive=True)}
577
  print(f"Generated {dataset_name}!")
578
 
579
 
580
- @save_dataset_button.click(inputs=[dataset_title, dataset_content, search_bar, dataset_dataframe])
581
- def save_dataset(title, content, search_query, df):
582
- raise gr.Error("Not implemented yet sorry ! Request your dataset to be saved in the Discussion tab (provide the dataset URL)")
583
-
584
-
585
- @dataset_share_button.click(inputs=[app_state], outputs=[dataset_share_textbox])
586
- def show_dataset_url(state):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
587
  return gr.Textbox(
588
- f"{URL}?q={state['search_query'].replace(' ', '+')}&dataset={state['dataset_name'].replace(' ', '+')}&tags={state['tags'].replace(' ', '+')}",
589
  visible=True,
590
  )
591
 
592
- @demo.load(outputs=[app_state, search_page, search_bar, dataset_page, dataset_title, dataset_content, dataset_share_textbox] + button_groups + buttons + [generated_texts_state])
593
- def load_app(request: gr.Request):
 
 
 
 
 
 
 
 
 
 
594
  query_params = dict(request.query_params)
595
  if "dataset" in query_params:
596
  yield from _show_dataset(
 
1
  import io
2
+ import os
3
  import re
4
  import time
5
  from itertools import islice
 
11
  import gradio as gr
12
  import pandas as pd
13
  import requests.exceptions
14
+ from huggingface_hub import InferenceClient, create_repo, whoami, DatasetCard
15
 
16
 
17
  model_id = "microsoft/Phi-3-mini-4k-instruct"
18
  client = InferenceClient(model_id)
19
+ save_dataset_hf_token = os.environ.get("SAVE_DATASET_HF_TOKEN")
20
 
21
  MAX_TOTAL_NB_ITEMS = 100 # almost infinite, don't judge me (actually it's because gradio needs a fixed number of components)
22
  MAX_NB_ITEMS_PER_GENERATION_CALL = 10
23
  NUM_ROWS = 100
24
  NUM_VARIANTS = 10
25
+ NAMESPACE = "infinite-dataset-hub"
26
  URL = "https://huggingface.co/spaces/infinite-dataset-hub/infinite-dataset-hub"
27
 
28
  GENERATE_DATASET_NAMES_FOR_SEARCH_QUERY = (
 
57
  "very nice but still plausible",
58
  ]
59
 
 
 
60
  landing_page_datasets_generated_text = """
61
  1. NewsEventsPredict (classification, media, trend)
62
  2. FinancialForecast (economy, stocks, regression)
 
72
  default_output = landing_page_datasets_generated_text.strip().split("\n")
73
  assert len(default_output) == MAX_NB_ITEMS_PER_GENERATION_CALL
74
 
75
+ DATASET_CARD_CONTENT = """
76
+ ---
77
+ license: mit
78
+ tags:
79
+ - infinite-dataset-hub
80
+ - synthetic
81
+ ---
82
+
83
+ {title}
84
+
85
+ _Note: This is an AI-generated dataset so its content may be inaccurate or false_
86
+
87
+ {content}
88
+
89
+ **Source of the data:**
90
+
91
+ The dataset was generated using the [Infinite Dataset Hub]({url}) and {model_id} using the query '{search_query}':
92
+
93
+ - **Dataset Generation Page**: {dataset_url}
94
+ - **Model**: https://huggingface.co/{model_id}
95
+ - **More Datasets**: https://huggingface.co/datasets?other=infinite-dataset-hub
96
+ """
97
+
98
  css = """
99
  a {
100
  color: var(--body-text-color);
 
169
  color: transparent;
170
  background-clip: text;
171
  }
172
+ .settings {
173
+ background: transparent;
174
+ }
175
+ .settings button span {
176
+ color: var(--body-text-color-subdued);
177
+ }
178
  """
179
 
180
 
181
  with gr.Blocks(css=css) as demo:
182
  generated_texts_state = gr.State((landing_page_datasets_generated_text,))
 
 
 
 
 
 
 
 
 
 
183
  with gr.Column() as search_page:
184
  with gr.Row():
 
 
185
  with gr.Column(scale=10):
186
+ gr.Markdown(
187
+ "# 🤗 Infinite Dataset Hub ♾️\n\n"
188
+ "An endless catalog of datasets, created just for you.\n\n"
189
+ )
190
  with gr.Row():
191
  search_bar = gr.Textbox(max_lines=1, placeholder="Search datasets, get infinite results", show_label=False, container=False, scale=9)
192
  search_button = gr.Button("🔍", variant="primary", scale=1)
 
 
 
 
 
 
193
  button_groups: list[gr.Group] = []
194
  buttons: list[gr.Button] = []
195
  for i in range(MAX_TOTAL_NB_ITEMS):
 
211
 
212
  load_more_datasets = gr.Button("Load more datasets") # TODO: disable when reaching end of page
213
  gr.Markdown(f"_powered by [{model_id}](https://huggingface.co/{model_id})_")
214
+ with gr.Column(scale=4, min_width="200px"):
215
+ with gr.Accordion("Settings", open=False, elem_classes="settings"):
216
+ gr.Markdown("Save datasets to your account")
217
+ gr.LoginButton()
218
+ select_namespace_dropdown = gr.Dropdown(choices=[NAMESPACE], value=NAMESPACE, label="Select user or organization", visible=False)
219
+ gr.Markdown("Save datasets as public or private datasets")
220
+ visibility_radio = gr.Radio(["public", "private"], value="public", container=False, interactive=False)
221
  with gr.Column(visible=False) as dataset_page:
222
+ gr.Markdown(
223
+ "# 🤗 Infinite Dataset Hub ♾️\n\n"
224
+ "An endless catalog of datasets, created just for you.\n\n"
225
+ )
226
  dataset_title = gr.Markdown()
227
  gr.Markdown("_Note: This is an AI-generated dataset so its content may be inaccurate or false_")
228
  dataset_content = gr.Markdown()
229
  generate_full_dataset_button = gr.Button("Generate Full Dataset", variant="primary")
230
  dataset_dataframe = gr.DataFrame(visible=False, interactive=False, wrap=True)
231
  save_dataset_button = gr.Button("💾 Save Dataset", variant="primary", visible=False)
232
+ open_dataset_message = gr.Markdown("", visible=False)
233
  dataset_share_button = gr.Button("Share Dataset URL")
234
  dataset_share_textbox = gr.Textbox(visible=False, show_copy_button=True, label="Copy this URL:", interactive=False, show_label=True)
235
  back_button = gr.Button("< Back", size="sm")
 
 
236
 
237
  ###################################
238
  #
 
278
 
279
  def gen_datasets_line_by_line(search_query: str, generated_texts: tuple[str] = ()) -> Iterator[str]:
280
  search_query = search_query or ""
281
+ search_query = search_query[:1000] if search_query.strip() else ""
282
  generated_text = ""
283
  current_line = ""
284
  for token in stream_reponse(
 
297
 
298
  def gen_dataset_content(search_query: str, dataset_name: str, tags: str) -> Iterator[str]:
299
  search_query = search_query or ""
300
+ search_query = search_query[:1000] if search_query.strip() else ""
301
  generated_text = ""
302
  for token in stream_reponse(GENERATE_DATASET_CONTENT_FOR_SEARCH_QUERY_AND_NAME_AND_TAGS.format(
303
  search_query=search_query,
 
442
 
443
 
444
  def _search_datasets(search_query):
445
+ yield {generated_texts_state: []}
446
  yield {
447
  button_group: gr.Group(elem_classes="buttonsGroup insivibleButtonGroup")
448
  for button_group in button_groups[MAX_NB_ITEMS_PER_GENERATION_CALL:]
 
477
  current_item_idx += 1
478
 
479
 
480
+ @search_button.click(inputs=search_bar, outputs=button_groups + buttons + [generated_texts_state])
481
  def search_dataset_from_search_button(search_query):
482
  yield from _search_datasets(search_query)
483
 
484
 
485
+ @search_bar.submit(inputs=search_bar, outputs=button_groups + buttons + [generated_texts_state])
486
  def search_dataset_from_search_bar(search_query):
487
  yield from _search_datasets(search_query)
488
 
 
521
  dataset_title: f"# {dataset_name}\n\n tags: {tags}",
522
  dataset_share_textbox: gr.Textbox(visible=False),
523
  dataset_dataframe: gr.DataFrame(visible=False),
524
+ generate_full_dataset_button: gr.Button(interactive=True),
525
  save_dataset_button: gr.Button(visible=False),
526
+ open_dataset_message: gr.Markdown(visible=False)
 
 
 
 
527
  }
528
  for generated_text in gen_dataset_content(search_query=search_query, dataset_name=dataset_name, tags=tags):
529
  yield {dataset_content: generated_text}
530
 
531
 
532
  show_dataset_inputs = [search_bar, *buttons]
533
+ show_dataset_outputs = [search_page, dataset_page, dataset_title, dataset_content, generate_full_dataset_button, dataset_dataframe, save_dataset_button, open_dataset_message, dataset_share_textbox]
534
  scroll_to_top_js = """
535
  function (...args) {
536
  console.log(args);
 
557
  return gr.Column(visible=True), gr.Column(visible=False)
558
 
559
 
560
+ @generate_full_dataset_button.click(inputs=[dataset_title, dataset_content, search_bar, select_namespace_dropdown, visibility_radio], outputs=[dataset_dataframe, generate_full_dataset_button, save_dataset_button])
561
+ def generate_full_dataset(title, content, search_query, namespace, visability):
562
  dataset_name, tags = title.strip("# ").split("\ntags:", 1)
563
  dataset_name, tags = dataset_name.strip(), tags.strip()
564
  csv_header, preview_df = parse_preview_df(content)
 
576
  output[:len(preview_df)] = [{"idx": i, **x} for i, x in enumerate(preview_df.to_dict(orient="records"))]
577
  yield {
578
  dataset_dataframe: gr.DataFrame(pd.DataFrame([{"idx": i, **x} for i, x in enumerate(output) if x]), visible=True),
579
+ generate_full_dataset_button: gr.Button(interactive=False),
580
+ save_dataset_button: gr.Button(f"💾 Save Dataset {namespace}/{dataset_name}" + (" (private)" if visability != "public" else ""), visible=True, interactive=False)
581
  }
582
  kwargs_iterable = [
583
  {
 
593
  ]
594
  for _ in iflatmap_unordered(generate_partial_dataset, kwargs_iterable=kwargs_iterable):
595
  yield {dataset_dataframe: pd.DataFrame([{"idx": i, **{column_name: x.get(column_name) for column_name in columns}} for i, x in enumerate(output) if x])}
596
+ yield {save_dataset_button: gr.Button(interactive=True)}
597
  print(f"Generated {dataset_name}!")
598
 
599
 
600
+ @save_dataset_button.click(inputs=[dataset_title, dataset_content, search_bar, dataset_dataframe, select_namespace_dropdown, visibility_radio], outputs=[save_dataset_button, open_dataset_message])
601
+ def save_dataset(title: str, content: str, search_query: str, df: pd.DataFrame, namespace: str, visability: str, oauth_token: Optional[gr.OAuthToken]):
602
+ dataset_name, tags = title.strip("# ").split("\ntags:", 1)
603
+ dataset_name, tags = dataset_name.strip(), tags.strip()
604
+ token = oauth_token.token if oauth_token else save_dataset_hf_token
605
+ repo_id = f"{namespace}/{dataset_name}"
606
+ dataset_url = f"{URL}?q={search_query.replace(' ', '+')}&dataset={dataset_name.replace(' ', '+')}&tags={tags.replace(' ', '+')}"
607
+ gr.Info("Saving dataset...")
608
+ yield {save_dataset_button: gr.Button(interactive=False)}
609
+ create_repo(repo_id=repo_id, repo_type="dataset", private=visability!="public", exist_ok=True, token=token)
610
+ df.to_csv(f"hf://datasets/{repo_id}/data.csv", storage_options={"token": token}, index=False)
611
+ DatasetCard(DATASET_CARD_CONTENT.format(title=title, content=content, url=URL, dataset_url=dataset_url, model_id=model_id, search_query=search_query)).push_to_hub(repo_id=repo_id, repo_type="dataset", token=token)
612
+ gr.Info(f"✅ Dataset saved at {repo_id}")
613
+ additional_message = "PS: You can also save datasets under your account in the Settings ;)"
614
+ yield {open_dataset_message: gr.Markdown(f"# 🎉 Yay ! Your dataset has been saved to [{repo_id}](https://huggingface.co/datasets/{repo_id}) !\n\nDataset link: [https://huggingface.co/datasets/{repo_id}](https://huggingface.co/datasets/{repo_id})\n\n{additional_message}", visible=True)}
615
+
616
+
617
+ @dataset_share_button.click(inputs=[dataset_title, search_bar], outputs=[dataset_share_textbox])
618
+ def show_dataset_url(title, search_query):
619
+ dataset_name, tags = title.strip("# ").split("\ntags:", 1)
620
+ dataset_name, tags = dataset_name.strip(), tags.strip()
621
  return gr.Textbox(
622
+ f"{URL}?q={search_query.replace(' ', '+')}&dataset={dataset_name.replace(' ', '+')}&tags={tags.replace(' ', '+')}",
623
  visible=True,
624
  )
625
 
626
+ @demo.load(outputs=show_dataset_outputs + button_groups + buttons + [generated_texts_state] + [select_namespace_dropdown, visibility_radio])
627
+ def load_app(request: gr.Request, oauth_token: Optional[gr.OAuthToken]):
628
+ if oauth_token:
629
+ user_info = whoami(oauth_token.token)
630
+ yield {
631
+ select_namespace_dropdown: gr.Dropdown(
632
+ choices=[user_info["name"]] + [org_info["name"] for org_info in user_info["orgs"]],
633
+ value=user_info["name"],
634
+ visible=True,
635
+ ),
636
+ visibility_radio: gr.Radio(interactive=True),
637
+ }
638
  query_params = dict(request.query_params)
639
  if "dataset" in query_params:
640
  yield from _show_dataset(