davidberenstein1957 HF staff committed on
Commit
6521775
·
1 Parent(s): 44fc64a

remove obsolete code

Browse files
.python-version DELETED
@@ -1 +0,0 @@
1
- synthetic-data-generator
 
 
app.py CHANGED
@@ -1,4 +1,9 @@
 
 
1
  from distilabel_dataset_generator.app import demo
2
 
 
 
 
3
  if __name__ == "__main__":
4
  demo.launch()
 
1
+ import os
2
+
3
  from distilabel_dataset_generator.app import demo
4
 
5
+ os.environ["API_KEY"] = "hf_..."
6
+ os.environ["API_KEY"] = "hf_..."
7
+
8
  if __name__ == "__main__":
9
  demo.launch()
src/distilabel_dataset_generator/apps/base.py CHANGED
@@ -1,6 +1,6 @@
1
  import io
2
  import uuid
3
- from typing import Any, Callable, List, Tuple, Union
4
 
5
  import argilla as rg
6
  import gradio as gr
@@ -11,161 +11,14 @@ from gradio import OAuthToken
11
  from huggingface_hub import HfApi, upload_file
12
 
13
  from distilabel_dataset_generator.utils import (
14
- _LOGGED_OUT_CSS,
15
  get_argilla_client,
16
- get_login_button,
17
  list_orgs,
18
- swap_visibility,
19
  )
20
 
21
  TEXTCAT_TASK = "text_classification"
22
  SFT_TASK = "supervised_fine_tuning"
23
 
24
 
25
- def get_main_ui(
26
- default_dataset_descriptions: List[str],
27
- default_system_prompts: List[str],
28
- default_datasets: List[pd.DataFrame],
29
- fn_generate_system_prompt: Callable,
30
- fn_generate_dataset: Callable,
31
- task: str,
32
- ):
33
- def fn_generate_sample_dataset(system_prompt, progress=gr.Progress()):
34
- if system_prompt in default_system_prompts:
35
- index = default_system_prompts.index(system_prompt)
36
- if index < len(default_datasets):
37
- return default_datasets[index]
38
- if task == TEXTCAT_TASK:
39
- result = fn_generate_dataset(
40
- system_prompt=system_prompt,
41
- difficulty="high school",
42
- clarity="clear",
43
- labels=[],
44
- num_labels=1,
45
- num_rows=1,
46
- progress=progress,
47
- is_sample=True,
48
- )
49
- else:
50
- result = fn_generate_dataset(
51
- system_prompt=system_prompt,
52
- num_turns=1,
53
- num_rows=1,
54
- progress=progress,
55
- is_sample=True,
56
- )
57
- return result
58
-
59
- with gr.Blocks(
60
- title="🧬 Synthetic Data Generator",
61
- head="🧬 Synthetic Data Generator",
62
- css=_LOGGED_OUT_CSS,
63
- ) as app:
64
- with gr.Row():
65
- gr.HTML(
66
- """<details style='display: inline-block;'><summary><h2 style='display: inline;'>How does it work?</h2></summary><img src='https://huggingface.co/spaces/argilla/synthetic-data-generator/resolve/main/assets/flow.png' width='100%' style='margin: 0 auto; display: block;'></details>"""
67
- )
68
- with gr.Row():
69
- gr.Markdown(
70
- "Want to run this locally or with other LLMs? Take a look at the FAQ tab. distilabel Synthetic Data Generator is free, we use the authentication token to push the dataset to the Hugging Face Hub and not for data generation."
71
- )
72
- with gr.Row():
73
- gr.Column()
74
- get_login_button()
75
- gr.Column()
76
-
77
- gr.Markdown("## Iterate on a sample dataset")
78
- with gr.Column() as main_ui:
79
- (
80
- dataset_description,
81
- examples,
82
- btn_generate_system_prompt,
83
- system_prompt,
84
- sample_dataset,
85
- btn_generate_sample_dataset,
86
- ) = get_iterate_on_sample_dataset_ui(
87
- default_dataset_descriptions=default_dataset_descriptions,
88
- default_system_prompts=default_system_prompts,
89
- default_datasets=default_datasets,
90
- task=task,
91
- )
92
- gr.Markdown("## Generate full dataset")
93
- gr.Markdown(
94
- "Once you're satisfied with the sample, generate a larger dataset and push it to Argilla or the Hugging Face Hub."
95
- )
96
- with gr.Row(variant="panel") as custom_input_ui:
97
- pass
98
-
99
- (
100
- dataset_name,
101
- add_to_existing_dataset,
102
- btn_generate_full_dataset_argilla,
103
- btn_generate_and_push_to_argilla,
104
- btn_push_to_argilla,
105
- org_name,
106
- repo_name,
107
- private,
108
- btn_generate_full_dataset,
109
- btn_generate_and_push_to_hub,
110
- btn_push_to_hub,
111
- final_dataset,
112
- success_message,
113
- ) = get_push_to_ui(default_datasets)
114
-
115
- sample_dataset.change(
116
- fn=lambda x: x,
117
- inputs=[sample_dataset],
118
- outputs=[final_dataset],
119
- )
120
-
121
- btn_generate_system_prompt.click(
122
- fn=fn_generate_system_prompt,
123
- inputs=[dataset_description],
124
- outputs=[system_prompt],
125
- show_progress=True,
126
- ).then(
127
- fn=fn_generate_sample_dataset,
128
- inputs=[system_prompt],
129
- outputs=[sample_dataset],
130
- show_progress=True,
131
- )
132
-
133
- btn_generate_sample_dataset.click(
134
- fn=fn_generate_sample_dataset,
135
- inputs=[system_prompt],
136
- outputs=[sample_dataset],
137
- show_progress=True,
138
- )
139
-
140
- app.load(fn=swap_visibility, outputs=main_ui)
141
- app.load(get_org_dropdown, outputs=[org_name])
142
-
143
- return (
144
- app,
145
- main_ui,
146
- custom_input_ui,
147
- dataset_description,
148
- examples,
149
- btn_generate_system_prompt,
150
- system_prompt,
151
- sample_dataset,
152
- btn_generate_sample_dataset,
153
- dataset_name,
154
- add_to_existing_dataset,
155
- btn_generate_full_dataset_argilla,
156
- btn_generate_and_push_to_argilla,
157
- btn_push_to_argilla,
158
- org_name,
159
- repo_name,
160
- private,
161
- btn_generate_full_dataset,
162
- btn_generate_and_push_to_hub,
163
- btn_push_to_hub,
164
- final_dataset,
165
- success_message,
166
- )
167
-
168
-
169
  def validate_argilla_user_workspace_dataset(
170
  dataset_name: str,
171
  add_to_existing_dataset: bool = True,
@@ -205,176 +58,6 @@ def get_org_dropdown(oauth_token: Union[OAuthToken, None]):
205
  )
206
 
207
 
208
- def get_push_to_ui(default_datasets):
209
- with gr.Column() as push_to_ui:
210
- (
211
- dataset_name,
212
- add_to_existing_dataset,
213
- btn_generate_full_dataset_argilla,
214
- btn_generate_and_push_to_argilla,
215
- btn_push_to_argilla,
216
- ) = get_argilla_tab()
217
- (
218
- org_name,
219
- repo_name,
220
- private,
221
- btn_generate_full_dataset,
222
- btn_generate_and_push_to_hub,
223
- btn_push_to_hub,
224
- ) = get_hf_tab()
225
- final_dataset = get_final_dataset_row(default_datasets)
226
- success_message = get_success_message_row()
227
- return (
228
- dataset_name,
229
- add_to_existing_dataset,
230
- btn_generate_full_dataset_argilla,
231
- btn_generate_and_push_to_argilla,
232
- btn_push_to_argilla,
233
- org_name,
234
- repo_name,
235
- private,
236
- btn_generate_full_dataset,
237
- btn_generate_and_push_to_hub,
238
- btn_push_to_hub,
239
- final_dataset,
240
- success_message,
241
- )
242
-
243
-
244
- def get_iterate_on_sample_dataset_ui(
245
- default_dataset_descriptions: List[str],
246
- default_system_prompts: List[str],
247
- default_datasets: List[pd.DataFrame],
248
- task: str,
249
- ):
250
- with gr.Column():
251
- dataset_description = gr.TextArea(
252
- label="Give a precise description of your desired application. Check the examples for inspiration.",
253
- value=default_dataset_descriptions[0],
254
- lines=2,
255
- )
256
- examples = gr.Examples(
257
- elem_id="system_prompt_examples",
258
- examples=[[example] for example in default_dataset_descriptions],
259
- inputs=[dataset_description],
260
- )
261
- with gr.Row():
262
- gr.Column(scale=1)
263
- btn_generate_system_prompt = gr.Button(
264
- value="Generate system prompt and sample dataset", variant="primary"
265
- )
266
- gr.Column(scale=1)
267
-
268
- system_prompt = gr.TextArea(
269
- label="System prompt for dataset generation. You can tune it and regenerate the sample.",
270
- value=default_system_prompts[0],
271
- lines=2 if task == TEXTCAT_TASK else 5,
272
- )
273
-
274
- with gr.Row():
275
- sample_dataset = gr.Dataframe(
276
- value=default_datasets[0],
277
- label=(
278
- "Sample dataset. Text truncated to 256 tokens."
279
- if task == TEXTCAT_TASK
280
- else "Sample dataset. Prompts and completions truncated to 256 tokens."
281
- ),
282
- interactive=False,
283
- wrap=True,
284
- )
285
-
286
- with gr.Row():
287
- gr.Column(scale=1)
288
- btn_generate_sample_dataset = gr.Button(
289
- value="Generate sample dataset", variant="primary"
290
- )
291
- gr.Column(scale=1)
292
-
293
- return (
294
- dataset_description,
295
- examples,
296
- btn_generate_system_prompt,
297
- system_prompt,
298
- sample_dataset,
299
- btn_generate_sample_dataset,
300
- )
301
-
302
-
303
- def get_argilla_tab() -> Tuple[Any]:
304
- with gr.Tab(label="Argilla"):
305
- if get_argilla_client() is not None:
306
- with gr.Row(variant="panel"):
307
- dataset_name = gr.Textbox(
308
- label="Dataset name",
309
- placeholder="dataset_name",
310
- value="my-distiset",
311
- )
312
- add_to_existing_dataset = gr.Checkbox(
313
- label="Allow adding records to existing dataset",
314
- info="When selected, you do need to ensure the dataset options are the same as in the existing dataset.",
315
- value=False,
316
- interactive=True,
317
- scale=1,
318
- )
319
-
320
- with gr.Row(variant="panel"):
321
- btn_generate_full_dataset_argilla = gr.Button(
322
- value="Generate", variant="primary", scale=2
323
- )
324
- btn_generate_and_push_to_argilla = gr.Button(
325
- value="Generate and Push to Argilla",
326
- variant="primary",
327
- scale=2,
328
- )
329
- btn_push_to_argilla = gr.Button(
330
- value="Push to Argilla", variant="primary", scale=2
331
- )
332
- else:
333
- gr.Markdown(
334
- "Please add `ARGILLA_API_URL` and `ARGILLA_API_KEY` to use Argilla or export the dataset to the Hugging Face Hub."
335
- )
336
- return (
337
- dataset_name,
338
- add_to_existing_dataset,
339
- btn_generate_full_dataset_argilla,
340
- btn_generate_and_push_to_argilla,
341
- btn_push_to_argilla,
342
- )
343
-
344
-
345
- def get_hf_tab() -> Tuple[Any]:
346
- with gr.Tab("Hugging Face Hub"):
347
- with gr.Row(variant="panel"):
348
- org_name = get_org_dropdown()
349
- repo_name = gr.Textbox(
350
- label="Repo name",
351
- placeholder="dataset_name",
352
- value="my-distiset",
353
- )
354
- private = gr.Checkbox(
355
- label="Private dataset",
356
- value=True,
357
- interactive=True,
358
- scale=1,
359
- )
360
- with gr.Row(variant="panel"):
361
- btn_generate_full_dataset = gr.Button(
362
- value="Generate", variant="primary", scale=2
363
- )
364
- btn_generate_and_push_to_hub = gr.Button(
365
- value="Generate and Push to Hub", variant="primary", scale=2
366
- )
367
- btn_push_to_hub = gr.Button(value="Push to Hub", variant="primary", scale=2)
368
- return (
369
- org_name,
370
- repo_name,
371
- private,
372
- btn_generate_full_dataset,
373
- btn_generate_and_push_to_hub,
374
- btn_push_to_hub,
375
- )
376
-
377
-
378
  def push_pipeline_code_to_hub(
379
  pipeline_code: str,
380
  org_name: str,
@@ -455,24 +138,6 @@ def validate_push_to_hub(org_name, repo_name):
455
  return repo_id
456
 
457
 
458
- def get_final_dataset_row(default_datasets) -> gr.Dataframe:
459
- with gr.Row():
460
- final_dataset = gr.Dataframe(
461
- value=default_datasets[0],
462
- label="Generated dataset",
463
- interactive=False,
464
- wrap=True,
465
- min_width=300,
466
- )
467
- return final_dataset
468
-
469
-
470
- def get_success_message_row() -> gr.Markdown:
471
- with gr.Row():
472
- success_message = gr.Markdown(visible=False)
473
- return success_message
474
-
475
-
476
  def show_success_message(org_name, repo_name) -> gr.Markdown:
477
  client = get_argilla_client()
478
  if client is None:
 
1
  import io
2
  import uuid
3
+ from typing import List, Union
4
 
5
  import argilla as rg
6
  import gradio as gr
 
11
  from huggingface_hub import HfApi, upload_file
12
 
13
  from distilabel_dataset_generator.utils import (
 
14
  get_argilla_client,
 
15
  list_orgs,
 
16
  )
17
 
18
  TEXTCAT_TASK = "text_classification"
19
  SFT_TASK = "supervised_fine_tuning"
20
 
21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  def validate_argilla_user_workspace_dataset(
23
  dataset_name: str,
24
  add_to_existing_dataset: bool = True,
 
58
  )
59
 
60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  def push_pipeline_code_to_hub(
62
  pipeline_code: str,
63
  org_name: str,
 
138
  return repo_id
139
 
140
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
  def show_success_message(org_name, repo_name) -> gr.Markdown:
142
  client = get_argilla_client()
143
  if client is None:
src/distilabel_dataset_generator/apps/sft.py CHANGED
@@ -28,7 +28,6 @@ from distilabel_dataset_generator.pipelines.sft import (
28
  get_response_generator,
29
  )
30
  from distilabel_dataset_generator.utils import (
31
- _LOGGED_OUT_CSS,
32
  get_argilla_client,
33
  get_org_dropdown,
34
  swap_visibility,
@@ -350,7 +349,7 @@ def hide_pipeline_code_visibility():
350
  ######################
351
 
352
 
353
- with gr.Blocks(css=_LOGGED_OUT_CSS) as app:
354
  with gr.Column() as main_ui:
355
  if not SFT_AVAILABLE:
356
  gr.Markdown(
 
28
  get_response_generator,
29
  )
30
  from distilabel_dataset_generator.utils import (
 
31
  get_argilla_client,
32
  get_org_dropdown,
33
  swap_visibility,
 
349
  ######################
350
 
351
 
352
+ with gr.Blocks() as app:
353
  with gr.Column() as main_ui:
354
  if not SFT_AVAILABLE:
355
  gr.Markdown(
src/distilabel_dataset_generator/apps/textcat.py CHANGED
@@ -28,7 +28,6 @@ from src.distilabel_dataset_generator.pipelines.textcat import (
28
  get_textcat_generator,
29
  )
30
  from src.distilabel_dataset_generator.utils import (
31
- _LOGGED_OUT_CSS,
32
  get_argilla_client,
33
  get_org_dropdown,
34
  get_preprocess_labels,
@@ -332,7 +331,7 @@ def hide_pipeline_code_visibility():
332
  ######################
333
 
334
 
335
- with gr.Blocks(css=_LOGGED_OUT_CSS) as app:
336
  with gr.Column() as main_ui:
337
  gr.Markdown("## 1. Describe the dataset you want")
338
  with gr.Row():
 
28
  get_textcat_generator,
29
  )
30
  from src.distilabel_dataset_generator.utils import (
 
31
  get_argilla_client,
32
  get_org_dropdown,
33
  get_preprocess_labels,
 
331
  ######################
332
 
333
 
334
+ with gr.Blocks() as app:
335
  with gr.Column() as main_ui:
336
  gr.Markdown("## 1. Describe the dataset you want")
337
  with gr.Row():
src/distilabel_dataset_generator/utils.py CHANGED
@@ -6,10 +6,7 @@ import gradio as gr
6
  import numpy as np
7
  import pandas as pd
8
  from gradio.oauth import (
9
- OAUTH_CLIENT_ID,
10
- OAUTH_CLIENT_SECRET,
11
- OAUTH_SCOPES,
12
- OPENID_PROVIDER_URL,
13
  get_space,
14
  )
15
  from huggingface_hub import whoami
@@ -17,30 +14,6 @@ from jinja2 import Environment, meta
17
 
18
  from distilabel_dataset_generator.constants import argilla_client
19
 
20
- _LOGGED_OUT_CSS = ".main_ui_logged_out{opacity: 0.3; pointer-events: none}"
21
-
22
-
23
- _CHECK_IF_SPACE_IS_SET = (
24
- all(
25
- [
26
- OAUTH_CLIENT_ID,
27
- OAUTH_CLIENT_SECRET,
28
- OAUTH_SCOPES,
29
- OPENID_PROVIDER_URL,
30
- ]
31
- )
32
- or get_space() is None
33
- )
34
-
35
- if _CHECK_IF_SPACE_IS_SET:
36
- from gradio.oauth import OAuthToken
37
- else:
38
- OAuthToken = str
39
-
40
-
41
- def get_login_button():
42
- return gr.LoginButton(value="Sign in!", size="sm", scale=2).activate()
43
-
44
 
45
  def get_duplicate_button():
46
  if get_space() is not None:
@@ -85,13 +58,6 @@ def get_org_dropdown(oauth_token: Union[OAuthToken, None] = None):
85
  )
86
 
87
 
88
- def get_token(oauth_token: Union[OAuthToken, None]):
89
- if oauth_token:
90
- return oauth_token.token
91
- else:
92
- return ""
93
-
94
-
95
  def swap_visibility(oauth_token: Union[OAuthToken, None]):
96
  if oauth_token:
97
  return gr.update(elem_classes=["main_ui_logged_in"])
@@ -99,28 +65,6 @@ def swap_visibility(oauth_token: Union[OAuthToken, None]):
99
  return gr.update(elem_classes=["main_ui_logged_out"])
100
 
101
 
102
- def get_base_app():
103
- with gr.Blocks(
104
- title="🧬 Synthetic Data Generator",
105
- head="🧬 Synthetic Data Generator",
106
- css=_LOGGED_OUT_CSS,
107
- ) as app:
108
- with gr.Row():
109
- gr.Markdown(
110
- "Want to run this locally or with other LLMs? Take a look at the FAQ tab. distilabel Synthetic Data Generator is free, we use the authentication token to push the dataset to the Hugging Face Hub and not for data generation."
111
- )
112
- with gr.Row():
113
- gr.Column()
114
- get_login_button()
115
- gr.Column()
116
-
117
- gr.Markdown("## Iterate on a sample dataset")
118
- with gr.Column() as main_ui:
119
- pass
120
-
121
- return app
122
-
123
-
124
  def get_argilla_client() -> Union[rg.Argilla, None]:
125
  return argilla_client
126
 
 
6
  import numpy as np
7
  import pandas as pd
8
  from gradio.oauth import (
9
+ OAuthToken,
 
 
 
10
  get_space,
11
  )
12
  from huggingface_hub import whoami
 
14
 
15
  from distilabel_dataset_generator.constants import argilla_client
16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
  def get_duplicate_button():
19
  if get_space() is not None:
 
58
  )
59
 
60
 
 
 
 
 
 
 
 
61
  def swap_visibility(oauth_token: Union[OAuthToken, None]):
62
  if oauth_token:
63
  return gr.update(elem_classes=["main_ui_logged_in"])
 
65
  return gr.update(elem_classes=["main_ui_logged_out"])
66
 
67
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
  def get_argilla_client() -> Union[rg.Argilla, None]:
69
  return argilla_client
70