Commit 7b7c1be · add MAX_NUM_ROWS
1 parent: dc56474

Files changed:
- README.md +6 -0
- src/synthetic_dataset_generator/apps/base.py +10 -0
- src/synthetic_dataset_generator/apps/eval.py +2 -0
- src/synthetic_dataset_generator/apps/sft.py +2 -0
- src/synthetic_dataset_generator/apps/textcat.py +2 -0
- src/synthetic_dataset_generator/constants.py +3 -1
- src/synthetic_dataset_generator/pipelines/eval.py +6 -6
- src/synthetic_dataset_generator/pipelines/sft.py +7 -6
- src/synthetic_dataset_generator/pipelines/textcat.py +6 -6
README.md
CHANGED
```diff
@@ -79,6 +79,12 @@ demo.launch()
 
 Optionally, you can set the following environment variables to customize the generation process.
 
+- `MAX_NUM_TOKENS`: The maximum number of tokens to generate, defaults to `2048`.
+- `MAX_NUM_ROWS`: The maximum number of rows to generate, defaults to `1000`.
+- `DEFAULT_BATCH_SIZE`: The default batch size to use for generating the dataset, defaults to `5`.
+
+Optionally, you can use different models and APIs.
+
 - `BASE_URL`: The base URL for any OpenAI compatible API, e.g. `https://api-inference.huggingface.co/v1/`, `https://api.openai.com/v1/`.
 - `MODEL`: The model to use for generating the dataset, e.g. `meta-llama/Meta-Llama-3.1-8B-Instruct`, `gpt-4o`.
 - `API_KEY`: The API key to use for the corresponding API, e.g. `hf_...`, `sk-...`.
```
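The new variables are read at import time by `constants.py` (see below), so they must be set before the package is imported. A minimal sketch with illustrative values, not defaults:

```python
# Configure the new knobs before importing the generator so that
# constants.py picks them up at module load.
import os

os.environ["MAX_NUM_TOKENS"] = "1024"    # cap per-generation length
os.environ["MAX_NUM_ROWS"] = "500"       # cap generated dataset size
os.environ["DEFAULT_BATCH_SIZE"] = "2"   # smaller generation batches
```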
src/synthetic_dataset_generator/apps/base.py
CHANGED
```diff
@@ -8,6 +8,7 @@ from datasets import Dataset, concatenate_datasets, load_dataset
 from gradio import OAuthToken
 from huggingface_hub import HfApi, upload_file
 
+from synthetic_dataset_generator.constants import MAX_NUM_ROWS
 from synthetic_dataset_generator.utils import get_argilla_client
 
 
@@ -136,3 +137,12 @@ def show_success_message(org_name, repo_name) -> gr.Markdown:
 
 def hide_success_message() -> gr.Markdown:
     return gr.Markdown(value="")
+
+
+def test_max_num_rows(num_rows: int) -> int:
+    if num_rows > MAX_NUM_ROWS:
+        num_rows = MAX_NUM_ROWS
+        gr.Info(
+            f"Number of rows is larger than the configured maximum. Setting number of rows to {MAX_NUM_ROWS}. Set environment variable `MAX_NUM_ROWS` to change this behavior."
+        )
+    return num_rows
```
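A standalone usage sketch of the new clamp (the `gr.Info` toast is stubbed with a `print` so it runs outside a Gradio event context):

```python
MAX_NUM_ROWS = 1000  # mirrors the constants.py default below

def test_max_num_rows(num_rows: int) -> int:
    # Clamp the requested row count and tell the user why (gr.Info in the app).
    if num_rows > MAX_NUM_ROWS:
        num_rows = MAX_NUM_ROWS
        print(f"Clamped to {MAX_NUM_ROWS}; set MAX_NUM_ROWS to change this.")
    return num_rows

assert test_max_num_rows(500) == 500    # under the cap: unchanged
assert test_max_num_rows(5000) == 1000  # over the cap: clamped
```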
src/synthetic_dataset_generator/apps/eval.py
CHANGED
```diff
@@ -22,6 +22,7 @@ from synthetic_dataset_generator.apps.base import (
     hide_success_message,
     push_pipeline_code_to_hub,
     show_success_message,
+    test_max_num_rows,
     validate_argilla_user_workspace_dataset,
     validate_push_to_hub,
 )
@@ -303,6 +304,7 @@ def _evaluate_dataset(
     num_rows: int = 10,
     is_sample: bool = False,
 ):
+    num_rows = test_max_num_rows(num_rows)
     if eval_type == "chat-eval":
         dataframe = evaluate_instruction_response(
             dataframe=dataframe,
```
src/synthetic_dataset_generator/apps/sft.py
CHANGED
```diff
@@ -14,6 +14,7 @@ from synthetic_dataset_generator.apps.base import (
     hide_success_message,
     push_pipeline_code_to_hub,
     show_success_message,
+    test_max_num_rows,
     validate_argilla_user_workspace_dataset,
     validate_push_to_hub,
 )
@@ -100,6 +101,7 @@ def generate_dataset(
     is_sample: bool = False,
     progress=gr.Progress(),
 ) -> pd.DataFrame:
+    num_rows = test_max_num_rows(num_rows)
     progress(0.0, desc="(1/2) Generating instructions")
     magpie_generator = get_magpie_generator(
         system_prompt, num_turns, temperature, is_sample
```
src/synthetic_dataset_generator/apps/textcat.py
CHANGED
```diff
@@ -15,6 +15,7 @@ from src.synthetic_dataset_generator.apps.base import (
     hide_success_message,
     push_pipeline_code_to_hub,
     show_success_message,
+    test_max_num_rows,
     validate_argilla_user_workspace_dataset,
     validate_push_to_hub,
 )
@@ -94,6 +95,7 @@ def generate_dataset(
     is_sample: bool = False,
     progress=gr.Progress(),
 ) -> pd.DataFrame:
+    num_rows = test_max_num_rows(num_rows)
     progress(0.0, desc="(1/2) Generating dataset")
     labels = get_preprocess_labels(labels)
     textcat_generator = get_textcat_generator(
```
src/synthetic_dataset_generator/constants.py
CHANGED
```diff
@@ -15,7 +15,9 @@ if HF_TOKEN is None:
 )
 
 # Inference
-
+MAX_NUM_TOKENS = os.getenv("MAX_NUM_TOKENS", 2048)
+MAX_NUM_ROWS: str | int = os.getenv("MAX_NUM_ROWS", 1000)
+DEFAULT_BATCH_SIZE = os.getenv("DEFAULT_BATCH_SIZE", 5)
 MODEL = os.getenv("MODEL", "meta-llama/Meta-Llama-3.1-8B-Instruct")
 API_KEYS = (
     [os.getenv("HF_TOKEN")]
```
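The `str | int` annotation on `MAX_NUM_ROWS` reflects a quirk of `os.getenv`: it returns a string when the variable is set and the (here integer) default otherwise. A sketch of the consequence, relevant because `test_max_num_rows` above compares `num_rows > MAX_NUM_ROWS`:

```python
import os

os.environ["MAX_NUM_ROWS"] = "2000"
MAX_NUM_ROWS = os.getenv("MAX_NUM_ROWS", 1000)
print(type(MAX_NUM_ROWS))   # <class 'str'> when the env var is set

# A comparison such as 1500 > MAX_NUM_ROWS would then raise TypeError,
# so callers may need an explicit cast:
print(1500 > int(MAX_NUM_ROWS))  # False
```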
src/synthetic_dataset_generator/pipelines/eval.py
CHANGED
```diff
@@ -5,7 +5,7 @@ from distilabel.steps.tasks import (
     UltraFeedback,
 )
 
-from synthetic_dataset_generator.constants import BASE_URL, MODEL
+from synthetic_dataset_generator.constants import BASE_URL, MAX_NUM_TOKENS, MODEL
 from synthetic_dataset_generator.pipelines.base import _get_next_api_key
 from synthetic_dataset_generator.utils import extract_column_names
 
@@ -18,7 +18,7 @@ def get_ultrafeedback_evaluator(aspect, is_sample):
         api_key=_get_next_api_key(),
         generation_kwargs={
             "temperature": 0.01,
-            "max_new_tokens":
+            "max_new_tokens": MAX_NUM_TOKENS if not is_sample else 512,
         },
     ),
     aspect=aspect,
@@ -36,7 +36,7 @@ def get_custom_evaluator(prompt_template, structured_output, columns, is_sample)
         structured_output={"format": "json", "schema": structured_output},
         generation_kwargs={
             "temperature": 0.01,
-            "max_new_tokens":
+            "max_new_tokens": MAX_NUM_TOKENS if not is_sample else 512,
         },
     ),
     template=prompt_template,
@@ -79,7 +79,7 @@ with Pipeline(name="ultrafeedback") as pipeline:
         api_key=os.environ["API_KEY"],
         generation_kwargs={{
             "temperature": 0.01,
-            "max_new_tokens":
+            "max_new_tokens": {MAX_NUM_TOKENS},
         }},
     ),
     aspect=aspect,
@@ -123,7 +123,7 @@ with Pipeline(name="ultrafeedback") as pipeline:
         api_key=os.environ["BASE_URL"],
         generation_kwargs={{
             "temperature": 0.01,
-            "max_new_tokens":
+            "max_new_tokens": {MAX_NUM_TOKENS},
         }},
         output_mappings={{
             "ratings": f"ratings_{{aspect}}",
@@ -177,7 +177,7 @@ with Pipeline(name="custom-evaluation") as pipeline:
         structured_output={{"format": "json", "schema": {structured_output}}},
         generation_kwargs={{
             "temperature": 0.01,
-            "max_new_tokens":
+            "max_new_tokens": {MAX_NUM_TOKENS},
         }},
     ),
     template=CUSTOM_TEMPLATE,
```
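Both evaluator factories use the same cap pattern: the full `MAX_NUM_TOKENS` budget for real runs, a small budget for UI sample runs. A minimal sketch of that logic:

```python
MAX_NUM_TOKENS = 2048  # constants.py default

def eval_generation_kwargs(is_sample: bool) -> dict:
    # eval.py caps samples at 512; the sft/textcat pipelines below use 256.
    return {
        "temperature": 0.01,
        "max_new_tokens": MAX_NUM_TOKENS if not is_sample else 512,
    }

assert eval_generation_kwargs(True)["max_new_tokens"] == 512
assert eval_generation_kwargs(False)["max_new_tokens"] == 2048
```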
src/synthetic_dataset_generator/pipelines/sft.py
CHANGED
```diff
@@ -4,6 +4,7 @@ from distilabel.steps.tasks import ChatGeneration, Magpie, TextGeneration
 from synthetic_dataset_generator.constants import (
     BASE_URL,
     MAGPIE_PRE_QUERY_TEMPLATE,
+    MAX_NUM_TOKENS,
     MODEL,
 )
 from synthetic_dataset_generator.pipelines.base import _get_next_api_key
@@ -149,7 +150,7 @@ def get_prompt_generator():
         base_url=BASE_URL,
         generation_kwargs={
             "temperature": 0.8,
-            "max_new_tokens":
+            "max_new_tokens": MAX_NUM_TOKENS,
             "do_sample": True,
         },
     ),
@@ -174,7 +175,7 @@ def get_magpie_generator(system_prompt, num_turns, temperature, is_sample):
         generation_kwargs={
             "temperature": temperature,
             "do_sample": True,
-            "max_new_tokens": 256 if is_sample else
+            "max_new_tokens": 256 if is_sample else MAX_NUM_TOKENS,
             "stop_sequences": _STOP_SEQUENCES,
         },
     ),
@@ -194,7 +195,7 @@ def get_magpie_generator(system_prompt, num_turns, temperature, is_sample):
         generation_kwargs={
             "temperature": temperature,
             "do_sample": True,
-            "max_new_tokens": 256 if is_sample else
+            "max_new_tokens": 256 if is_sample else MAX_NUM_TOKENS,
             "stop_sequences": _STOP_SEQUENCES,
         },
     ),
@@ -217,7 +218,7 @@ def get_response_generator(system_prompt, num_turns, temperature, is_sample):
         api_key=_get_next_api_key(),
         generation_kwargs={
             "temperature": temperature,
-            "max_new_tokens": 256 if is_sample else
+            "max_new_tokens": 256 if is_sample else MAX_NUM_TOKENS,
         },
     ),
     system_prompt=system_prompt,
@@ -233,7 +234,7 @@ def get_response_generator(system_prompt, num_turns, temperature, is_sample):
         api_key=_get_next_api_key(),
         generation_kwargs={
             "temperature": temperature,
-            "max_new_tokens":
+            "max_new_tokens": MAX_NUM_TOKENS,
         },
     ),
     output_mappings={"generation": "completion"},
@@ -268,7 +269,7 @@ with Pipeline(name="sft") as pipeline:
         generation_kwargs={{
             "temperature": {temperature},
             "do_sample": True,
-            "max_new_tokens":
+            "max_new_tokens": {MAX_NUM_TOKENS},
             "stop_sequences": {_STOP_SEQUENCES}
         }},
         api_key=os.environ["BASE_URL"],
```
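The `with Pipeline(name="sft")` hunk sits inside an f-string that renders shareable pipeline code, which is why its braces are doubled: `{{` escapes a literal brace, while `{MAX_NUM_TOKENS}` interpolates the configured cap into the generated snippet. A minimal sketch of that templating:

```python
MAX_NUM_TOKENS = 2048
temperature = 0.9

# Doubled braces survive as literal braces; single braces interpolate.
snippet = f"""generation_kwargs={{
    "temperature": {temperature},
    "max_new_tokens": {MAX_NUM_TOKENS},
}}"""
print(snippet)
# generation_kwargs={
#     "temperature": 0.9,
#     "max_new_tokens": 2048,
# }
```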
src/synthetic_dataset_generator/pipelines/textcat.py
CHANGED
```diff
@@ -9,7 +9,7 @@ from distilabel.steps.tasks import (
 )
 from pydantic import BaseModel, Field
 
-from synthetic_dataset_generator.constants import BASE_URL, MODEL
+from synthetic_dataset_generator.constants import BASE_URL, MAX_NUM_TOKENS, MODEL
 from synthetic_dataset_generator.pipelines.base import _get_next_api_key
 from synthetic_dataset_generator.utils import get_preprocess_labels
 
@@ -69,7 +69,7 @@ def get_prompt_generator():
         structured_output={"format": "json", "schema": TextClassificationTask},
         generation_kwargs={
             "temperature": 0.8,
-            "max_new_tokens":
+            "max_new_tokens": MAX_NUM_TOKENS,
             "do_sample": True,
         },
     ),
@@ -88,7 +88,7 @@ def get_textcat_generator(difficulty, clarity, temperature, is_sample):
         api_key=_get_next_api_key(),
         generation_kwargs={
             "temperature": temperature,
-            "max_new_tokens": 256 if is_sample else
+            "max_new_tokens": 256 if is_sample else MAX_NUM_TOKENS,
             "do_sample": True,
             "top_k": 50,
             "top_p": 0.95,
@@ -110,7 +110,7 @@ def get_labeller_generator(system_prompt, labels, num_labels):
         api_key=_get_next_api_key(),
         generation_kwargs={
             "temperature": 0.7,
-            "max_new_tokens":
+            "max_new_tokens": MAX_NUM_TOKENS,
         },
     ),
     context=system_prompt,
@@ -159,7 +159,7 @@ with Pipeline(name="textcat") as pipeline:
         api_key=os.environ["API_KEY"],
         generation_kwargs={{
             "temperature": {temperature},
-            "max_new_tokens":
+            "max_new_tokens": {MAX_NUM_TOKENS},
             "do_sample": True,
             "top_k": 50,
             "top_p": 0.95,
@@ -203,7 +203,7 @@ with Pipeline(name="textcat") as pipeline:
         api_key=os.environ["API_KEY"],
         generation_kwargs={{
             "temperature": 0.8,
-            "max_new_tokens":
+            "max_new_tokens": {MAX_NUM_TOKENS},
         }},
     ),
     n={num_labels},
```