davidberenstein1957 HF staff committed on
Commit
1fc08db
·
1 Parent(s): 371c76b

Add requirements.txt for synthetic dataset generator dependency, update import paths for InferenceEndpointsLLM in multiple files, and modify example deployment script to comment out HF_TOKEN.

Browse files
examples/hf-serverless-deployment.py CHANGED
@@ -8,7 +8,7 @@ import os
8
 
9
  from synthetic_dataset_generator import launch
10
 
11
- os.environ["HF_TOKEN"] = "hf_..." # push the data to huggingface
12
  os.environ["MODEL"] = "meta-llama/Llama-3.1-8B-Instruct" # use instruct model
13
  os.environ["MAGPIE_PRE_QUERY_TEMPLATE"] = "llama3" # use the template for the model
14
 
 
8
 
9
  from synthetic_dataset_generator import launch
10
 
11
+ # os.environ["HF_TOKEN"] = "hf_..." # push the data to huggingface
12
  os.environ["MODEL"] = "meta-llama/Llama-3.1-8B-Instruct" # use instruct model
13
  os.environ["MAGPIE_PRE_QUERY_TEMPLATE"] = "llama3" # use the template for the model
14
 
requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ -e git+https://github.com/argilla-io/synthetic-data-generator.git#egg=synthetic-dataset-generator
src/synthetic_dataset_generator/_inference_endpoints.py CHANGED
@@ -2,7 +2,7 @@ import warnings
2
 
3
  import distilabel
4
  import distilabel.distiset
5
- from distilabel.llms import InferenceEndpointsLLM
6
  from pydantic import (
7
  ValidationError,
8
  model_validator,
@@ -55,4 +55,4 @@ class CustomInferenceEndpointsLLM(InferenceEndpointsLLM):
55
  )
56
 
57
 
58
- distilabel.llms.InferenceEndpointsLLM = CustomInferenceEndpointsLLM
 
2
 
3
  import distilabel
4
  import distilabel.distiset
5
+ from distilabel.models import InferenceEndpointsLLM
6
  from pydantic import (
7
  ValidationError,
8
  model_validator,
 
55
  )
56
 
57
 
58
+ distilabel.models.llms.InferenceEndpointsLLM = CustomInferenceEndpointsLLM
src/synthetic_dataset_generator/pipelines/base.py CHANGED
@@ -1,7 +1,7 @@
1
  import math
2
  import random
3
 
4
- from distilabel.llms import ClientvLLM, InferenceEndpointsLLM, OllamaLLM, OpenAILLM
5
  from distilabel.steps.tasks import TextGeneration
6
 
7
  from synthetic_dataset_generator.constants import (
 
1
  import math
2
  import random
3
 
4
+ from distilabel.models import ClientvLLM, InferenceEndpointsLLM, OllamaLLM, OpenAILLM
5
  from distilabel.steps.tasks import TextGeneration
6
 
7
  from synthetic_dataset_generator.constants import (
src/synthetic_dataset_generator/pipelines/chat.py CHANGED
@@ -229,6 +229,7 @@ def get_response_generator(system_prompt, num_turns, temperature, is_sample):
229
 
230
  def generate_pipeline_code(system_prompt, num_turns, num_rows, temperature):
231
  input_mappings = _get_output_mappings(num_turns)
 
232
  code = f"""
233
  # Requirements: `pip install distilabel[hf-inference-endpoints]`
234
  import os
@@ -241,7 +242,7 @@ SYSTEM_PROMPT = "{system_prompt}"
241
 
242
  with Pipeline(name="sft") as pipeline:
243
  magpie = MagpieGenerator(
244
- llm={_get_llm_class()}.from_json({_get_llm().model_dump_json()})},
245
  n_turns={num_turns},
246
  num_rows={num_rows},
247
  batch_size=1,
 
229
 
230
  def generate_pipeline_code(system_prompt, num_turns, num_rows, temperature):
231
  input_mappings = _get_output_mappings(num_turns)
232
+
233
  code = f"""
234
  # Requirements: `pip install distilabel[hf-inference-endpoints]`
235
  import os
 
242
 
243
  with Pipeline(name="sft") as pipeline:
244
  magpie = MagpieGenerator(
245
+ llm={_get_llm_class()}.from_dict({_get_llm().model_dump()}),
246
  n_turns={num_turns},
247
  num_rows={num_rows},
248
  batch_size=1,
src/synthetic_dataset_generator/pipelines/eval.py CHANGED
@@ -1,5 +1,5 @@
1
  from datasets import get_dataset_config_names, get_dataset_split_names
2
- from distilabel.llms import InferenceEndpointsLLM
3
  from distilabel.steps.tasks import (
4
  TextGeneration,
5
  UltraFeedback,
@@ -57,7 +57,7 @@ from datasets import load_dataset
57
  from distilabel.pipeline import Pipeline
58
  from distilabel.steps import LoadDataFromDicts
59
  from distilabel.steps.tasks import UltraFeedback
60
- from distilabel.llms import InferenceEndpointsLLM
61
 
62
  MODEL = "{MODEL}"
63
  BASE_URL = "{BASE_URL}"
@@ -97,7 +97,7 @@ import os
97
  from distilabel.pipeline import Pipeline
98
  from distilabel.steps import LoadDataFromDicts, CombineOutputs
99
  from distilabel.steps.tasks import UltraFeedback
100
- from distilabel.llms import InferenceEndpointsLLM
101
 
102
  MODEL = "{MODEL}"
103
  BASE_URL = "{BASE_URL}"
@@ -154,7 +154,7 @@ import os
154
  from distilabel.pipeline import Pipeline
155
  from distilabel.steps import LoadDataFromHub
156
  from distilabel.steps.tasks import TextGeneration
157
- from distilabel.llms import InferenceEndpointsLLM
158
 
159
  MODEL = "{MODEL}"
160
  BASE_URL = "{BASE_URL}"
 
1
  from datasets import get_dataset_config_names, get_dataset_split_names
2
+ from distilabel.models import InferenceEndpointsLLM
3
  from distilabel.steps.tasks import (
4
  TextGeneration,
5
  UltraFeedback,
 
57
  from distilabel.pipeline import Pipeline
58
  from distilabel.steps import LoadDataFromDicts
59
  from distilabel.steps.tasks import UltraFeedback
60
+ from distilabel.models import InferenceEndpointsLLM
61
 
62
  MODEL = "{MODEL}"
63
  BASE_URL = "{BASE_URL}"
 
97
  from distilabel.pipeline import Pipeline
98
  from distilabel.steps import LoadDataFromDicts, CombineOutputs
99
  from distilabel.steps.tasks import UltraFeedback
100
+ from distilabel.models import InferenceEndpointsLLM
101
 
102
  MODEL = "{MODEL}"
103
  BASE_URL = "{BASE_URL}"
 
154
  from distilabel.pipeline import Pipeline
155
  from distilabel.steps import LoadDataFromHub
156
  from distilabel.steps.tasks import TextGeneration
157
+ from distilabel.models import InferenceEndpointsLLM
158
 
159
  MODEL = "{MODEL}"
160
  BASE_URL = "{BASE_URL}"
src/synthetic_dataset_generator/pipelines/textcat.py CHANGED
@@ -133,17 +133,19 @@ def generate_pipeline_code(
133
  # Requirements: `pip install distilabel[hf-inference-endpoints]`
134
  import os
135
  import random
136
- from distilabel.llms import {_get_llm_class()}
137
  from distilabel.pipeline import Pipeline
138
  from distilabel.steps import LoadDataFromDicts, KeepColumns
139
  from distilabel.steps.tasks import {"GenerateTextClassificationData" if num_labels == 1 else "GenerateTextClassificationData, TextClassification"}
140
 
 
 
141
  with Pipeline(name="textcat") as pipeline:
142
 
143
  task_generator = LoadDataFromDicts(data=[{{"task": TEXT_CLASSIFICATION_TASK}}])
144
 
145
  textcat_generation = GenerateTextClassificationData(
146
- llm={_get_llm_class()}.from_json({_get_llm().model_dump_json()}),
147
  seed=random.randint(0, 2**32 - 1),
148
  difficulty={None if difficulty == "mixed" else repr(difficulty)},
149
  clarity={None if clarity == "mixed" else repr(clarity)},
@@ -176,7 +178,7 @@ with Pipeline(name="textcat") as pipeline:
176
  )
177
 
178
  textcat_labeller = TextClassification(
179
- llm={_get_llm_class()}.from_json({_get_llm().model_dump_json()}),
180
  n={num_labels},
181
  available_labels={labels},
182
  context=TEXT_CLASSIFICATION_TASK,
 
133
  # Requirements: `pip install distilabel[hf-inference-endpoints]`
134
  import os
135
  import random
136
+ from distilabel.models import {_get_llm_class()}
137
  from distilabel.pipeline import Pipeline
138
  from distilabel.steps import LoadDataFromDicts, KeepColumns
139
  from distilabel.steps.tasks import {"GenerateTextClassificationData" if num_labels == 1 else "GenerateTextClassificationData, TextClassification"}
140
 
141
+ SYSTEM_PROMPT = "{system_prompt}"
142
+
143
  with Pipeline(name="textcat") as pipeline:
144
 
145
  task_generator = LoadDataFromDicts(data=[{{"task": TEXT_CLASSIFICATION_TASK}}])
146
 
147
  textcat_generation = GenerateTextClassificationData(
148
+ llm={_get_llm_class()}.from_dict({_get_llm().model_dump()}),
149
  seed=random.randint(0, 2**32 - 1),
150
  difficulty={None if difficulty == "mixed" else repr(difficulty)},
151
  clarity={None if clarity == "mixed" else repr(clarity)},
 
178
  )
179
 
180
  textcat_labeller = TextClassification(
181
+ llm={_get_llm_class()}.from_dict({_get_llm().model_dump()}),
182
  n={num_labels},
183
  available_labels={labels},
184
  context=TEXT_CLASSIFICATION_TASK,