davidberenstein1957 HF staff committed on
Commit
714b133
1 Parent(s): fd2f716

add buttons

Browse files
README.md CHANGED
@@ -49,7 +49,7 @@ This tool simplifies the process of creating custom datasets, enabling you to:
49
  - Describe the characteristics of your desired application
50
  - Iterate on sample datasets
51
  - Produce full-scale datasets
52
- - Push your datasets to the [Hugging Face Hub](https://huggingface.co/datasets?other=datacraft) and/or Argilla
53
 
54
  By using the Synthetic Data Generator, you can rapidly prototype and create datasets, accelerating your AI development process.
55
 
 
49
  - Describe the characteristics of your desired application
50
  - Iterate on sample datasets
51
  - Produce full-scale datasets
52
+ - Push your datasets to the [Hugging Face Hub](https://huggingface.co/datasets?other=datacraft) and/or [Argilla](https://docs.argilla.io/)
53
 
54
  By using the Synthetic Data Generator, you can rapidly prototype and create datasets, accelerating your AI development process.
55
 
src/synthetic_dataset_generator/app.py CHANGED
@@ -7,19 +7,7 @@ from synthetic_dataset_generator.apps.textcat import app as textcat_app
7
  theme = "argilla/argilla-theme"
8
 
9
  css = """
10
- button[role="tab"][aria-selected="true"] { border: 0; background: var(--neutral-800); color: white; border-top-right-radius: var(--radius-md); border-top-left-radius: var(--radius-md)}
11
- button[role="tab"][aria-selected="true"]:hover {border-color: var(--button-primary-background-fill)}
12
- .tabitem { border: 0; padding-inline: 0}
13
  .main_ui_logged_out{opacity: 0.3; pointer-events: none}
14
- .group_padding{padding: .55em}
15
- .gallery-item {background: var(--background-fill-secondary); text-align: left}
16
- .gallery {white-space: wrap}
17
- #space_model .wrap > label:last-child{opacity: 0.3; pointer-events:none}
18
- #system_prompt_examples {
19
- color: var(--body-text-color) !important;
20
- background-color: var(--block-background-fill) !important;
21
- }
22
- .container {padding-inline: 0 !important}
23
  """
24
 
25
  demo = TabbedInterface(
 
7
  theme = "argilla/argilla-theme"
8
 
9
  css = """
 
 
 
10
  .main_ui_logged_out{opacity: 0.3; pointer-events: none}
 
 
 
 
 
 
 
 
 
11
  """
12
 
13
  demo = TabbedInterface(
src/synthetic_dataset_generator/apps/base.py CHANGED
@@ -129,16 +129,18 @@ def show_success_message(org_name, repo_name) -> gr.Markdown:
129
  client = get_argilla_client()
130
  if client is None:
131
  return gr.Markdown(
132
- value="""
133
- <div style="padding: 1em; background-color: #e6f3e6; border-radius: 5px; margin-top: 1em;">
134
  <h3 style="color: #2e7d32; margin: 0;">Dataset Published Successfully!</h3>
135
  <p style="margin-top: 0.5em;">
136
- The generated dataset is in the right format for fine-tuning with TRL, AutoTrain, or other frameworks. Your dataset is now available at:
137
- <a href="https://huggingface.co/datasets/{org_name}/{repo_name}" target="_blank" style="color: #1565c0; text-decoration: none;">
138
- https://huggingface.co/datasets/{org_name}/{repo_name}
139
- </a>
 
 
140
  </p>
141
- <p style="margin-top: 1em; font-size: 0.9em; color: #333;">
142
  By configuring an `ARGILLA_API_URL` and `ARGILLA_API_KEY` you can curate the dataset in Argilla.
143
  Unfamiliar with Argilla? Here are some docs to help you get started:
144
  <br>• <a href="https://docs.argilla.io/latest/getting_started/quickstart/" target="_blank">How to get started with Argilla</a>
@@ -151,7 +153,7 @@ def show_success_message(org_name, repo_name) -> gr.Markdown:
151
  argilla_api_url = client.api_url
152
  return gr.Markdown(
153
  value=f"""
154
- <div style="padding: 1em; background-color: #e6f3e6; border-radius: 5px; margin-top: 1em;">
155
  <h3 style="color: #2e7d32; margin: 0;">Dataset Published Successfully!</h3>
156
  <p style="margin-top: 0.5em;">
157
  <strong>
@@ -161,13 +163,18 @@ def show_success_message(org_name, repo_name) -> gr.Markdown:
161
  </strong>
162
  </p>
163
  <p style="margin-top: 0.5em;">
164
- The generated dataset is in the right format for fine-tuning with TRL, AutoTrain, or other frameworks. Your dataset is now available at:
165
- <a href="https://huggingface.co/datasets/{org_name}/{repo_name}" target="_blank" style="color: #1565c0; text-decoration: none;">
166
- https://huggingface.co/datasets/{org_name}/{repo_name}
167
- </a>
 
 
 
 
 
168
  </p>
169
  </div>
170
- <p style="margin-top: 1em; font-size: 0.9em; color: #333;">
171
  Unfamiliar with Argilla? Here are some docs to help you get started:
172
  <br>• <a href="https://docs.argilla.io/latest/how_to_guides/annotate/" target="_blank">How to curate data in Argilla</a>
173
  <br>• <a href="https://docs.argilla.io/latest/how_to_guides/import_export/" target="_blank">How to export data once you have reviewed the dataset</a>
 
129
  client = get_argilla_client()
130
  if client is None:
131
  return gr.Markdown(
132
+ value=f"""
133
+ <div style="padding: 1em; background-color: rgba(211, 211, 211, 0.5); border-radius: 5px; margin-top: 1em; color: inherit;">
134
  <h3 style="color: #2e7d32; margin: 0;">Dataset Published Successfully!</h3>
135
  <p style="margin-top: 0.5em;">
136
+ The generated dataset is in the right format for fine-tuning with TRL, AutoTrain, or other frameworks.
137
+ <div style="display: flex; gap: 10px;">
138
+ <button class="lg primary svelte-cmf5ev" onclick="window.open('https://huggingface.co/datasets/{org_name}/{repo_name}', '_blank')" id="component-96">
139
+ Open in Hub
140
+ </button>
141
+ </div>
142
  </p>
143
+ <p style="margin-top: 1em; color: #333;">
144
  By configuring an `ARGILLA_API_URL` and `ARGILLA_API_KEY` you can curate the dataset in Argilla.
145
  Unfamiliar with Argilla? Here are some docs to help you get started:
146
  <br>• <a href="https://docs.argilla.io/latest/getting_started/quickstart/" target="_blank">How to get started with Argilla</a>
 
153
  argilla_api_url = client.api_url
154
  return gr.Markdown(
155
  value=f"""
156
+ <div style="padding: 1em; background-color: rgba(211, 211, 211, 0.5); border-radius: 5px; margin-top: 1em; color: inherit;">
157
  <h3 style="color: #2e7d32; margin: 0;">Dataset Published Successfully!</h3>
158
  <p style="margin-top: 0.5em;">
159
  <strong>
 
163
  </strong>
164
  </p>
165
  <p style="margin-top: 0.5em;">
166
+ The generated dataset is in the right format for fine-tuning with TRL, AutoTrain, or other frameworks.
167
+ <div style="display: flex; gap: 10px;">
168
+ <button class="lg primary svelte-cmf5ev" onclick="window.open('https://huggingface.co/datasets/{org_name}/{repo_name}', '_blank')" id="component-95">
169
+ Open in Argilla
170
+ </button>
171
+ <button class="lg secondary svelte-cmf5ev" onclick="window.open('https://huggingface.co/datasets/{org_name}/{repo_name}', '_blank')" id="component-96">
172
+ Open in Hub
173
+ </button>
174
+ </div>
175
  </p>
176
  </div>
177
+ <p style="margin-top: 1em; color: #333;">
178
  Unfamiliar with Argilla? Here are some docs to help you get started:
179
  <br>• <a href="https://docs.argilla.io/latest/how_to_guides/annotate/" target="_blank">How to curate data in Argilla</a>
180
  <br>• <a href="https://docs.argilla.io/latest/how_to_guides/import_export/" target="_blank">How to export data once you have reviewed the dataset</a>
src/synthetic_dataset_generator/apps/sft.py CHANGED
@@ -363,28 +363,22 @@ with gr.Blocks() as app:
363
  label="Dataset description",
364
  placeholder="Give a precise description of your desired dataset.",
365
  )
366
- with gr.Accordion("Temperature", open=False):
367
- temperature = gr.Slider(
368
- minimum=0.1,
369
- maximum=1,
370
- value=0.8,
371
- step=0.1,
372
- interactive=True,
373
- show_label=False,
374
  )
375
- load_btn = gr.Button(
376
- "Create dataset",
377
- variant="primary",
378
- )
379
- with gr.Column(scale=2):
380
  examples = gr.Examples(
381
  examples=DEFAULT_DATASET_DESCRIPTIONS,
382
  inputs=[dataset_description],
383
  cache_examples=False,
384
  label="Examples",
385
  )
386
- with gr.Column(scale=1):
387
- pass
388
 
389
  gr.HTML(value="<hr>")
390
  gr.Markdown(value="## 2. Configure your dataset")
@@ -403,9 +397,14 @@ with gr.Blocks() as app:
403
  interactive=True,
404
  info="Choose between 1 (single turn with 'instruction-response' columns) and 2-4 (multi-turn conversation with a 'messages' column).",
405
  )
406
- btn_apply_to_sample_dataset = gr.Button(
407
- "Refresh dataset", variant="secondary"
408
- )
 
 
 
 
 
409
  with gr.Column(scale=3):
410
  dataframe = gr.Dataframe(
411
  headers=["prompt", "completion"],
@@ -431,6 +430,14 @@ with gr.Blocks() as app:
431
  interactive=True,
432
  scale=1,
433
  )
 
 
 
 
 
 
 
 
434
  private = gr.Checkbox(
435
  label="Private dataset",
436
  value=False,
 
363
  label="Dataset description",
364
  placeholder="Give a precise description of your desired dataset.",
365
  )
366
+ with gr.Row():
367
+ load_btn = gr.Button(
368
+ "Create",
369
+ variant="primary",
 
 
 
 
370
  )
371
+ clear_btn = gr.Button(
372
+ "Clear",
373
+ variant="secondary",
374
+ )
375
+ with gr.Column(scale=3):
376
  examples = gr.Examples(
377
  examples=DEFAULT_DATASET_DESCRIPTIONS,
378
  inputs=[dataset_description],
379
  cache_examples=False,
380
  label="Examples",
381
  )
 
 
382
 
383
  gr.HTML(value="<hr>")
384
  gr.Markdown(value="## 2. Configure your dataset")
 
397
  interactive=True,
398
  info="Choose between 1 (single turn with 'instruction-response' columns) and 2-4 (multi-turn conversation with a 'messages' column).",
399
  )
400
+ with gr.Row():
401
+ btn_apply_to_sample_dataset = gr.Button(
402
+ "Save", variant="primary"
403
+ )
404
+ clear_btn = gr.Button(
405
+ "Clear",
406
+ variant="secondary",
407
+ )
408
  with gr.Column(scale=3):
409
  dataframe = gr.Dataframe(
410
  headers=["prompt", "completion"],
 
430
  interactive=True,
431
  scale=1,
432
  )
433
+ temperature = gr.Slider(
434
+ minimum=0.1,
435
+ maximum=1,
436
+ value=0.8,
437
+ step=0.1,
438
+ interactive=True,
439
+ show_label=False,
440
+ )
441
  private = gr.Checkbox(
442
  label="Private dataset",
443
  value=False,
src/synthetic_dataset_generator/apps/textcat.py CHANGED
@@ -340,28 +340,22 @@ with gr.Blocks() as app:
340
  label="Dataset description",
341
  placeholder="Give a precise description of your desired dataset.",
342
  )
343
- with gr.Accordion("Temperature", open=False):
344
- temperature = gr.Slider(
345
- minimum=0.1,
346
- maximum=1,
347
- value=0.8,
348
- step=0.1,
349
- interactive=True,
350
- show_label=False,
351
  )
352
- load_btn = gr.Button(
353
- "Create dataset",
354
- variant="primary",
355
- )
356
- with gr.Column(scale=2):
357
  examples = gr.Examples(
358
  examples=DEFAULT_DATASET_DESCRIPTIONS,
359
  inputs=[dataset_description],
360
  cache_examples=False,
361
  label="Examples",
362
  )
363
- with gr.Column(scale=1):
364
- pass
365
 
366
  gr.HTML("<hr>")
367
  gr.Markdown("## 2. Configure your dataset")
@@ -415,9 +409,9 @@ with gr.Blocks() as app:
415
  info="Select the comprehension level for the text. Ensure it matches the task context.",
416
  interactive=True,
417
  )
418
- btn_apply_to_sample_dataset = gr.Button(
419
- "Refresh dataset", variant="secondary"
420
- )
421
  with gr.Column(scale=3):
422
  dataframe = gr.Dataframe(
423
  headers=["labels", "text"], wrap=True, height=500, interactive=False
@@ -440,6 +434,14 @@ with gr.Blocks() as app:
440
  interactive=True,
441
  scale=1,
442
  )
 
 
 
 
 
 
 
 
443
  private = gr.Checkbox(
444
  label="Private dataset",
445
  value=False,
 
340
  label="Dataset description",
341
  placeholder="Give a precise description of your desired dataset.",
342
  )
343
+ with gr.Row():
344
+ load_btn = gr.Button(
345
+ "Create",
346
+ variant="primary",
 
 
 
 
347
  )
348
+ clear_btn = gr.Button(
349
+ "Clear",
350
+ variant="secondary",
351
+ )
352
+ with gr.Column(scale=3):
353
  examples = gr.Examples(
354
  examples=DEFAULT_DATASET_DESCRIPTIONS,
355
  inputs=[dataset_description],
356
  cache_examples=False,
357
  label="Examples",
358
  )
 
 
359
 
360
  gr.HTML("<hr>")
361
  gr.Markdown("## 2. Configure your dataset")
 
409
  info="Select the comprehension level for the text. Ensure it matches the task context.",
410
  interactive=True,
411
  )
412
+ with gr.Row():
413
+ btn_apply_to_sample_dataset = gr.Button("Save", variant="primary")
414
+ clear_btn = gr.Button("Clear", variant="secondary")
415
  with gr.Column(scale=3):
416
  dataframe = gr.Dataframe(
417
  headers=["labels", "text"], wrap=True, height=500, interactive=False
 
434
  interactive=True,
435
  scale=1,
436
  )
437
+ temperature = gr.Slider(
438
+ minimum=0.1,
439
+ maximum=1,
440
+ value=0.8,
441
+ step=0.1,
442
+ interactive=True,
443
+ show_label=False,
444
+ )
445
  private = gr.Checkbox(
446
  label="Private dataset",
447
  value=False,
src/synthetic_dataset_generator/pipelines/eval.py CHANGED
@@ -17,7 +17,7 @@ def get_ultrafeedback_evaluator(aspect, is_sample):
17
  base_url=BASE_URL,
18
  api_key=_get_next_api_key(),
19
  generation_kwargs={
20
- "temperature": 0,
21
  "max_new_tokens": 256 if is_sample else 2048,
22
  },
23
  ),
@@ -35,7 +35,7 @@ def get_custom_evaluator(prompt_template, structured_output, columns, is_sample)
35
  api_key=_get_next_api_key(),
36
  structured_output={"format": "json", "schema": structured_output},
37
  generation_kwargs={
38
- "temperature": 0,
39
  "max_new_tokens": 256 if is_sample else 2048,
40
  },
41
  ),
@@ -78,7 +78,7 @@ with Pipeline(name="ultrafeedback") as pipeline:
78
  base_url=BASE_URL,
79
  api_key=os.environ["API_KEY"],
80
  generation_kwargs={{
81
- "temperature": 0,
82
  "max_new_tokens": 2048,
83
  }},
84
  ),
@@ -122,7 +122,7 @@ with Pipeline(name="ultrafeedback") as pipeline:
122
  base_url=BASE_URL,
123
  api_key=os.environ["BASE_URL"],
124
  generation_kwargs={{
125
- "temperature": 0,
126
  "max_new_tokens": 2048,
127
  }},
128
  output_mappings={{
@@ -176,7 +176,7 @@ with Pipeline(name="custom-evaluation") as pipeline:
176
  api_key=os.environ["HF_TOKEN"],
177
  structured_output={{"format": "json", "schema": {structured_output}}},
178
  generation_kwargs={{
179
- "temperature": 0,
180
  "max_new_tokens": 2048,
181
  }},
182
  ),
 
17
  base_url=BASE_URL,
18
  api_key=_get_next_api_key(),
19
  generation_kwargs={
20
+ "temperature": 0.01,
21
  "max_new_tokens": 256 if is_sample else 2048,
22
  },
23
  ),
 
35
  api_key=_get_next_api_key(),
36
  structured_output={"format": "json", "schema": structured_output},
37
  generation_kwargs={
38
+ "temperature": 0.01,
39
  "max_new_tokens": 256 if is_sample else 2048,
40
  },
41
  ),
 
78
  base_url=BASE_URL,
79
  api_key=os.environ["API_KEY"],
80
  generation_kwargs={{
81
+ "temperature": 0.01,
82
  "max_new_tokens": 2048,
83
  }},
84
  ),
 
122
  base_url=BASE_URL,
123
  api_key=os.environ["BASE_URL"],
124
  generation_kwargs={{
125
+ "temperature": 0.01,
126
  "max_new_tokens": 2048,
127
  }},
128
  output_mappings={{
 
176
  api_key=os.environ["HF_TOKEN"],
177
  structured_output={{"format": "json", "schema": {structured_output}}},
178
  generation_kwargs={{
179
+ "temperature": 0.01,
180
  "max_new_tokens": 2048,
181
  }},
182
  ),