Spaces:

argilla
/

synthetic-data-generator

Running

App Files Files Community

davidberenstein1957 committed on Dec 3, 2024

Commit

714b133

1 Parent(s): fd2f716

add buttons

Browse files

Files changed (6) hide show

README.md +1 -1
src/synthetic_dataset_generator/app.py +0 -12
src/synthetic_dataset_generator/apps/base.py +20 -13
src/synthetic_dataset_generator/apps/sft.py +25 -18
src/synthetic_dataset_generator/apps/textcat.py +20 -18
src/synthetic_dataset_generator/pipelines/eval.py +5 -5

README.md CHANGED Viewed

@@ -49,7 +49,7 @@ This tool simplifies the process of creating custom datasets, enabling you to:
 - Describe the characteristics of your desired application
 - Iterate on sample datasets
 - Produce full-scale datasets
-- Push your datasets to the [Hugging Face Hub](https://huggingface.co/datasets?other=datacraft) and/or Argilla
 By using the Synthetic Data Generator, you can rapidly prototype and create datasets, accelerating your AI development process.

 - Describe the characteristics of your desired application
 - Iterate on sample datasets
 - Produce full-scale datasets
+- Push your datasets to the [Hugging Face Hub](https://huggingface.co/datasets?other=datacraft) and/or [Argilla](https://docs.argilla.io/)
 By using the Synthetic Data Generator, you can rapidly prototype and create datasets, accelerating your AI development process.

src/synthetic_dataset_generator/app.py CHANGED Viewed

@@ -7,19 +7,7 @@ from synthetic_dataset_generator.apps.textcat import app as textcat_app
 theme = "argilla/argilla-theme"
 css = """
-button[role="tab"][aria-selected="true"] { border: 0; background: var(--neutral-800); color: white; border-top-right-radius: var(--radius-md); border-top-left-radius: var(--radius-md)}
-button[role="tab"][aria-selected="true"]:hover {border-color: var(--button-primary-background-fill)}
-.tabitem { border: 0; padding-inline: 0}
 .main_ui_logged_out{opacity: 0.3; pointer-events: none}
-.group_padding{padding: .55em}
-.gallery-item {background: var(--background-fill-secondary); text-align: left}
-.gallery {white-space: wrap}
-#space_model .wrap > label:last-child{opacity: 0.3; pointer-events:none}
-#system_prompt_examples {
-    color: var(--body-text-color) !important;
-    background-color: var(--block-background-fill) !important;
-}
-.container {padding-inline: 0 !important}
 """
 demo = TabbedInterface(

 theme = "argilla/argilla-theme"
 css = """
 .main_ui_logged_out{opacity: 0.3; pointer-events: none}
 """
 demo = TabbedInterface(

src/synthetic_dataset_generator/apps/base.py CHANGED Viewed

@@ -129,16 +129,18 @@ def show_success_message(org_name, repo_name) -> gr.Markdown:
     client = get_argilla_client()
     if client is None:
         return gr.Markdown(
-            value="""
-            <div style="padding: 1em; background-color: #e6f3e6; border-radius: 5px; margin-top: 1em;">
                 <h3 style="color: #2e7d32; margin: 0;">Dataset Published Successfully!</h3>
                 <p style="margin-top: 0.5em;">
-                The generated dataset is in the right format for fine-tuning with TRL, AutoTrain, or other frameworks. Your dataset is now available at:
-                <a href="https://huggingface.co/datasets/{org_name}/{repo_name}" target="_blank" style="color: #1565c0; text-decoration: none;">
-                    https://huggingface.co/datasets/{org_name}/{repo_name}
-                    </a>
                 </p>
-                <p style="margin-top: 1em; font-size: 0.9em; color: #333;">
                     By configuring an `ARGILLA_API_URL` and `ARGILLA_API_KEY` you can curate the dataset in Argilla.
                     Unfamiliar with Argilla? Here are some docs to help you get started:
                     <br>• <a href="https://docs.argilla.io/latest/getting_started/quickstart/" target="_blank">How to get started with Argilla</a>
@@ -151,7 +153,7 @@ def show_success_message(org_name, repo_name) -> gr.Markdown:
     argilla_api_url = client.api_url
     return gr.Markdown(
         value=f"""
-        <div style="padding: 1em; background-color: #e6f3e6; border-radius: 5px; margin-top: 1em;">
             <h3 style="color: #2e7d32; margin: 0;">Dataset Published Successfully!</h3>
             <p style="margin-top: 0.5em;">
                 <strong>
@@ -161,13 +163,18 @@ def show_success_message(org_name, repo_name) -> gr.Markdown:
                 </strong>
             </p>
             <p style="margin-top: 0.5em;">
-                The generated dataset is in the right format for fine-tuning with TRL, AutoTrain, or other frameworks. Your dataset is now available at:
-                <a href="https://huggingface.co/datasets/{org_name}/{repo_name}" target="_blank" style="color: #1565c0; text-decoration: none;">
-                    https://huggingface.co/datasets/{org_name}/{repo_name}
-                </a>
             </p>
         </div>
-        <p style="margin-top: 1em; font-size: 0.9em; color: #333;">
             Unfamiliar with Argilla? Here are some docs to help you get started:
             <br>• <a href="https://docs.argilla.io/latest/how_to_guides/annotate/" target="_blank">How to curate data in Argilla</a>
             <br>• <a href="https://docs.argilla.io/latest/how_to_guides/import_export/" target="_blank">How to export data once you have reviewed the dataset</a>

     client = get_argilla_client()
     if client is None:
         return gr.Markdown(
+            value=f"""
+            <div style="padding: 1em; background-color: rgba(211, 211, 211, 0.5); border-radius: 5px; margin-top: 1em; color: inherit;">
                 <h3 style="color: #2e7d32; margin: 0;">Dataset Published Successfully!</h3>
                 <p style="margin-top: 0.5em;">
+                The generated dataset is in the right format for fine-tuning with TRL, AutoTrain, or other frameworks.
+                <div style="display: flex; gap: 10px;">
+                    <button class="lg primary svelte-cmf5ev" onclick="window.open('https://huggingface.co/datasets/{org_name}/{repo_name}', '_blank')" id="component-96">
+                        Open in Hub
+                    </button>
+                </div>
                 </p>
+                <p style="margin-top: 1em; color: #333;">
                     By configuring an `ARGILLA_API_URL` and `ARGILLA_API_KEY` you can curate the dataset in Argilla.
                     Unfamiliar with Argilla? Here are some docs to help you get started:
                     <br>• <a href="https://docs.argilla.io/latest/getting_started/quickstart/" target="_blank">How to get started with Argilla</a>
     argilla_api_url = client.api_url
     return gr.Markdown(
         value=f"""
+        <div style="padding: 1em; background-color: rgba(211, 211, 211, 0.5); border-radius: 5px; margin-top: 1em; color: inherit;">
             <h3 style="color: #2e7d32; margin: 0;">Dataset Published Successfully!</h3>
             <p style="margin-top: 0.5em;">
                 <strong>
                 </strong>
             </p>
             <p style="margin-top: 0.5em;">
+                The generated dataset is in the right format for fine-tuning with TRL, AutoTrain, or other frameworks.
+                <div style="display: flex; gap: 10px;">
+                    <button class="lg primary svelte-cmf5ev" onclick="window.open('https://huggingface.co/datasets/{org_name}/{repo_name}', '_blank')" id="component-95">
+                        Open in Argilla
+                    </button>
+                    <button class="lg secondary svelte-cmf5ev" onclick="window.open('https://huggingface.co/datasets/{org_name}/{repo_name}', '_blank')" id="component-96">
+                        Open in Hub
+                    </button>
+                </div>
             </p>
         </div>
+        <p style="margin-top: 1em; color: #333;">
             Unfamiliar with Argilla? Here are some docs to help you get started:
             <br>• <a href="https://docs.argilla.io/latest/how_to_guides/annotate/" target="_blank">How to curate data in Argilla</a>
             <br>• <a href="https://docs.argilla.io/latest/how_to_guides/import_export/" target="_blank">How to export data once you have reviewed the dataset</a>

src/synthetic_dataset_generator/apps/sft.py CHANGED Viewed

@@ -363,28 +363,22 @@ with gr.Blocks() as app:
                         label="Dataset description",
                         placeholder="Give a precise description of your desired dataset.",
                     )
-                    with gr.Accordion("Temperature", open=False):
-                        temperature = gr.Slider(
-                            minimum=0.1,
-                            maximum=1,
-                            value=0.8,
-                            step=0.1,
-                            interactive=True,
-                            show_label=False,
                         )
-                    load_btn = gr.Button(
-                        "Create dataset",
-                        variant="primary",
-                    )
-                with gr.Column(scale=2):
                     examples = gr.Examples(
                         examples=DEFAULT_DATASET_DESCRIPTIONS,
                         inputs=[dataset_description],
                         cache_examples=False,
                         label="Examples",
                     )
-                with gr.Column(scale=1):
-                    pass
             gr.HTML(value="<hr>")
             gr.Markdown(value="## 2. Configure your dataset")
@@ -403,9 +397,14 @@ with gr.Blocks() as app:
                         interactive=True,
                         info="Choose between 1 (single turn with 'instruction-response' columns) and 2-4 (multi-turn conversation with a 'messages' column).",
                     )
-                    btn_apply_to_sample_dataset = gr.Button(
-                        "Refresh dataset", variant="secondary"
-                    )
                 with gr.Column(scale=3):
                     dataframe = gr.Dataframe(
                         headers=["prompt", "completion"],
@@ -431,6 +430,14 @@ with gr.Blocks() as app:
                         interactive=True,
                         scale=1,
                     )
                     private = gr.Checkbox(
                         label="Private dataset",
                         value=False,

                         label="Dataset description",
                         placeholder="Give a precise description of your desired dataset.",
                     )
+                    with gr.Row():
+                        load_btn = gr.Button(
+                            "Create",
+                            variant="primary",
                         )
+                        clear_btn = gr.Button(
+                            "Clear",
+                            variant="secondary",
+                        )
+                with gr.Column(scale=3):
                     examples = gr.Examples(
                         examples=DEFAULT_DATASET_DESCRIPTIONS,
                         inputs=[dataset_description],
                         cache_examples=False,
                         label="Examples",
                     )
             gr.HTML(value="<hr>")
             gr.Markdown(value="## 2. Configure your dataset")
                         interactive=True,
                         info="Choose between 1 (single turn with 'instruction-response' columns) and 2-4 (multi-turn conversation with a 'messages' column).",
                     )
+                    with gr.Row():
+                        btn_apply_to_sample_dataset = gr.Button(
+                            "Save", variant="primary"
+                        )
+                        clear_btn = gr.Button(
+                            "Clear",
+                            variant="secondary",
+                        )
                 with gr.Column(scale=3):
                     dataframe = gr.Dataframe(
                         headers=["prompt", "completion"],
                         interactive=True,
                         scale=1,
                     )
+                    temperature = gr.Slider(
+                        minimum=0.1,
+                        maximum=1,
+                        value=0.8,
+                        step=0.1,
+                        interactive=True,
+                        show_label=False,
+                    )
                     private = gr.Checkbox(
                         label="Private dataset",
                         value=False,

src/synthetic_dataset_generator/apps/textcat.py CHANGED Viewed

@@ -340,28 +340,22 @@ with gr.Blocks() as app:
                     label="Dataset description",
                     placeholder="Give a precise description of your desired dataset.",
                 )
-                with gr.Accordion("Temperature", open=False):
-                    temperature = gr.Slider(
-                        minimum=0.1,
-                        maximum=1,
-                        value=0.8,
-                        step=0.1,
-                        interactive=True,
-                        show_label=False,
                     )
-                load_btn = gr.Button(
-                    "Create dataset",
-                    variant="primary",
-                )
-            with gr.Column(scale=2):
                 examples = gr.Examples(
                     examples=DEFAULT_DATASET_DESCRIPTIONS,
                     inputs=[dataset_description],
                     cache_examples=False,
                     label="Examples",
                 )
-            with gr.Column(scale=1):
-                pass
         gr.HTML("<hr>")
         gr.Markdown("## 2. Configure your dataset")
@@ -415,9 +409,9 @@ with gr.Blocks() as app:
                     info="Select the comprehension level for the text. Ensure it matches the task context.",
                     interactive=True,
                 )
-                btn_apply_to_sample_dataset = gr.Button(
-                    "Refresh dataset", variant="secondary"
-                )
             with gr.Column(scale=3):
                 dataframe = gr.Dataframe(
                     headers=["labels", "text"], wrap=True, height=500, interactive=False
@@ -440,6 +434,14 @@ with gr.Blocks() as app:
                     interactive=True,
                     scale=1,
                 )
                 private = gr.Checkbox(
                     label="Private dataset",
                     value=False,

                     label="Dataset description",
                     placeholder="Give a precise description of your desired dataset.",
                 )
+                with gr.Row():
+                    load_btn = gr.Button(
+                        "Create",
+                        variant="primary",
                     )
+                    clear_btn = gr.Button(
+                        "Clear",
+                        variant="secondary",
+                    )
+            with gr.Column(scale=3):
                 examples = gr.Examples(
                     examples=DEFAULT_DATASET_DESCRIPTIONS,
                     inputs=[dataset_description],
                     cache_examples=False,
                     label="Examples",
                 )
         gr.HTML("<hr>")
         gr.Markdown("## 2. Configure your dataset")
                     info="Select the comprehension level for the text. Ensure it matches the task context.",
                     interactive=True,
                 )
+                with gr.Row():
+                    btn_apply_to_sample_dataset = gr.Button("Save", variant="primary")
+                    clear_btn = gr.Button("Clear", variant="secondary")
             with gr.Column(scale=3):
                 dataframe = gr.Dataframe(
                     headers=["labels", "text"], wrap=True, height=500, interactive=False
                     interactive=True,
                     scale=1,
                 )
+                temperature = gr.Slider(
+                    minimum=0.1,
+                    maximum=1,
+                    value=0.8,
+                    step=0.1,
+                    interactive=True,
+                    show_label=False,
+                )
                 private = gr.Checkbox(
                     label="Private dataset",
                     value=False,

src/synthetic_dataset_generator/pipelines/eval.py CHANGED Viewed

@@ -17,7 +17,7 @@ def get_ultrafeedback_evaluator(aspect, is_sample):
             base_url=BASE_URL,
             api_key=_get_next_api_key(),
             generation_kwargs={
-                "temperature": 0,
                 "max_new_tokens": 256 if is_sample else 2048,
             },
         ),
@@ -35,7 +35,7 @@ def get_custom_evaluator(prompt_template, structured_output, columns, is_sample)
             api_key=_get_next_api_key(),
             structured_output={"format": "json", "schema": structured_output},
             generation_kwargs={
-                "temperature": 0,
                 "max_new_tokens": 256 if is_sample else 2048,
             },
         ),
@@ -78,7 +78,7 @@ with Pipeline(name="ultrafeedback") as pipeline:
             base_url=BASE_URL,
             api_key=os.environ["API_KEY"],
             generation_kwargs={{
-                "temperature": 0,
                 "max_new_tokens": 2048,
             }},
         ),
@@ -122,7 +122,7 @@ with Pipeline(name="ultrafeedback") as pipeline:
                 base_url=BASE_URL,
                 api_key=os.environ["BASE_URL"],
                 generation_kwargs={{
-                    "temperature": 0,
                     "max_new_tokens": 2048,
                 }},
             output_mappings={{
@@ -176,7 +176,7 @@ with Pipeline(name="custom-evaluation") as pipeline:
             api_key=os.environ["HF_TOKEN"],
             structured_output={{"format": "json", "schema": {structured_output}}},
             generation_kwargs={{
-                "temperature": 0,
                 "max_new_tokens": 2048,
             }},
         ),

             base_url=BASE_URL,
             api_key=_get_next_api_key(),
             generation_kwargs={
+                "temperature": 0.01,
                 "max_new_tokens": 256 if is_sample else 2048,
             },
         ),
             api_key=_get_next_api_key(),
             structured_output={"format": "json", "schema": structured_output},
             generation_kwargs={
+                "temperature": 0.01,
                 "max_new_tokens": 256 if is_sample else 2048,
             },
         ),
             base_url=BASE_URL,
             api_key=os.environ["API_KEY"],
             generation_kwargs={{
+                "temperature": 0.01,
                 "max_new_tokens": 2048,
             }},
         ),
                 base_url=BASE_URL,
                 api_key=os.environ["BASE_URL"],
                 generation_kwargs={{
+                    "temperature": 0.01,
                     "max_new_tokens": 2048,
                 }},
             output_mappings={{
             api_key=os.environ["HF_TOKEN"],
             structured_output={{"format": "json", "schema": {structured_output}}},
             generation_kwargs={{
+                "temperature": 0.01,
                 "max_new_tokens": 2048,
             }},
         ),