# Source: Hugging Face file view (raw / history / blame), 7.44 kB.
# Last commit 7b7c1be by davidberenstein1957: "add MAX_NUM_ROWS".
from datasets import get_dataset_config_names, get_dataset_split_names
from distilabel.llms import InferenceEndpointsLLM
from distilabel.steps.tasks import (
TextGeneration,
UltraFeedback,
)
from synthetic_dataset_generator.constants import BASE_URL, MAX_NUM_TOKENS, MODEL
from synthetic_dataset_generator.pipelines.base import _get_next_api_key
from synthetic_dataset_generator.utils import extract_column_names
def get_ultrafeedback_evaluator(aspect, is_sample):
    """Create and load an ``UltraFeedback`` task for a single aspect.

    Args:
        aspect: Forwarded unchanged as the ``aspect`` argument of
            ``UltraFeedback``.
        is_sample: When truthy, generation is capped at 512 new tokens;
            otherwise ``MAX_NUM_TOKENS`` is used.

    Returns:
        The ``UltraFeedback`` task, after ``load()`` has been called on it.
    """
    token_budget = 512 if is_sample else MAX_NUM_TOKENS
    llm = InferenceEndpointsLLM(
        model_id=MODEL,
        base_url=BASE_URL,
        api_key=_get_next_api_key(),
        generation_kwargs={
            "temperature": 0.01,
            "max_new_tokens": token_budget,
        },
    )
    evaluator = UltraFeedback(llm=llm, aspect=aspect)
    evaluator.load()
    return evaluator
def get_custom_evaluator(prompt_template, structured_output, columns, is_sample):
    """Create and load a ``TextGeneration`` task that emits structured JSON.

    Args:
        prompt_template: Passed to ``TextGeneration`` as ``template``.
        structured_output: Used as the ``schema`` entry of the LLM's
            ``structured_output`` setting (format is fixed to ``"json"``).
        columns: Passed to ``TextGeneration`` as ``columns``.
        is_sample: When truthy, generation is capped at 512 new tokens;
            otherwise ``MAX_NUM_TOKENS`` is used.

    Returns:
        The ``TextGeneration`` task, after ``load()`` has been called on it.
    """
    token_budget = 512 if is_sample else MAX_NUM_TOKENS
    llm = InferenceEndpointsLLM(
        model_id=MODEL,
        base_url=BASE_URL,
        api_key=_get_next_api_key(),
        structured_output={"format": "json", "schema": structured_output},
        generation_kwargs={
            "temperature": 0.01,
            "max_new_tokens": token_budget,
        },
    )
    evaluator = TextGeneration(
        llm=llm,
        template=prompt_template,
        columns=columns,
    )
    evaluator.load()
    return evaluator
def generate_ultrafeedback_pipeline_code(
    repo_id, subset, split, aspects, instruction_column, response_columns, num_rows
):
    """Return the source of a standalone distilabel UltraFeedback script.

    Two script shapes are produced: a single ``UltraFeedback`` task when
    exactly one aspect is given, otherwise one task per aspect whose outputs
    are merged with ``CombineOutputs``.

    The generated script calls a ``preprocess_data`` helper that it neither
    defines nor imports; whoever runs the script has to supply it.

    Args:
        repo_id: Dataset repository id interpolated into ``load_dataset``.
        subset: Dataset configuration name interpolated into ``load_dataset``.
        split: Split name; the script loads ``"<split>[:<num_rows>]"``.
        aspects: Sequence of aspect names to evaluate.
        instruction_column: Interpolated into the ``preprocess_data`` call.
        response_columns: Interpolated into the ``preprocess_data`` call.
        num_rows: Number of rows of the split the script loads.

    Returns:
        The generated Python source as a string.
    """
    if len(aspects) == 1:
        # The single aspect is inlined as a literal: the generated script has
        # no `aspect` variable of its own to refer to.
        code = f"""
# Requirements: `pip install distilabel[hf-inference-endpoints]`
import os
from datasets import load_dataset
from distilabel.pipeline import Pipeline
from distilabel.steps import LoadDataFromDicts
from distilabel.steps.tasks import UltraFeedback
from distilabel.llms import InferenceEndpointsLLM

MODEL = "{MODEL}"
BASE_URL = "{BASE_URL}"
os.environ["API_KEY"] = "hf_xxx" # https://huggingface.co/settings/tokens/new?ownUserPermissions=repo.content.read&ownUserPermissions=repo.write&globalPermissions=inference.serverless.write&canReadGatedRepos=true&tokenType=fineGrained

hf_ds = load_dataset("{repo_id}", "{subset}", split="{split}[:{num_rows}]")
data = preprocess_data(hf_ds, "{instruction_column}", "{response_columns}") # to get a list of dictionaries

with Pipeline(name="ultrafeedback") as pipeline:
    load_the_dataset = LoadDataFromDicts(
        data = data,
    )

    ultrafeedback_evaluator = UltraFeedback(
        llm=InferenceEndpointsLLM(
            model_id=MODEL,
            base_url=BASE_URL,
            api_key=os.environ["API_KEY"],
            generation_kwargs={{
                "temperature": 0.01,
                "max_new_tokens": {MAX_NUM_TOKENS},
            }},
        ),
        aspect="{aspects[0]}",
    )

    load_the_dataset >> ultrafeedback_evaluator

if __name__ == "__main__":
    distiset = pipeline.run()
"""
    else:
        # The aspect list is emitted into the script so its `for` loop has
        # something to iterate over; doubled braces survive as f-string
        # placeholders in the generated code.
        code = f"""
# Requirements: `pip install distilabel[hf-inference-endpoints]`
import os
from datasets import load_dataset
from distilabel.pipeline import Pipeline
from distilabel.steps import LoadDataFromDicts, CombineOutputs
from distilabel.steps.tasks import UltraFeedback
from distilabel.llms import InferenceEndpointsLLM

MODEL = "{MODEL}"
BASE_URL = "{BASE_URL}"
os.environ["API_KEY"] = "hf_xxx" # https://huggingface.co/settings/tokens/new?ownUserPermissions=repo.content.read&ownUserPermissions=repo.write&globalPermissions=inference.serverless.write&canReadGatedRepos=true&tokenType=fineGrained

hf_ds = load_dataset("{repo_id}", "{subset}", split="{split}[:{num_rows}]")
data = preprocess_data(hf_ds, "{instruction_column}", "{response_columns}") # to get a list of dictionaries

aspects = {list(aspects)!r}

with Pipeline(name="ultrafeedback") as pipeline:
    load_the_dataset = LoadDataFromDicts(
        data = data,
    )

    tasks = []
    for aspect in aspects:
        evaluate_responses = UltraFeedback(
            name=f"evaluate-responses-{{aspect}}",
            aspect=aspect,
            llm=InferenceEndpointsLLM(
                model_id=MODEL,
                base_url=BASE_URL,
                api_key=os.environ["API_KEY"],
                generation_kwargs={{
                    "temperature": 0.01,
                    "max_new_tokens": {MAX_NUM_TOKENS},
                }},
            ),
            output_mappings={{
                "ratings": f"ratings_{{aspect}}",
                "types": f"type_{{aspect}}",
                "rationales": f"rationales_for_types_{{aspect}}",
                "rationales-for-ratings": f"rationales_for_ratings_{{aspect}}",
            }} if aspect in ["truthfulness", "helpfulness"] else {{"rationales": f"rationales_{{aspect}}", "ratings": f"ratings_{{aspect}}"}},
        )
        tasks.append(evaluate_responses)

    combine_outputs = CombineOutputs()

    load_the_dataset >> tasks >> combine_outputs

if __name__ == "__main__":
    distiset = pipeline.run()
"""
    return code
def generate_custom_pipeline_code(
    repo_id, subset, split, prompt_template, structured_output, num_rows
):
    """Return the source of a standalone distilabel custom-evaluation script.

    The generated script loads the dataset with ``LoadDataFromHub`` and runs a
    ``TextGeneration`` task configured with the given prompt template and
    JSON ``structured_output`` schema.

    Args:
        repo_id: Dataset repository id for ``LoadDataFromHub``.
        subset: Dataset configuration name (``config``).
        split: Split name.
        prompt_template: Prompt template text; emitted as a Python literal so
            newlines, quotes and backslashes cannot break the script.
        structured_output: Schema interpolated verbatim into the script and
            passed to ``extract_column_names`` to derive ``columns``.
        num_rows: Value for ``num_examples``.

    Returns:
        The generated Python source as a string.
    """
    columns = extract_column_names(structured_output)
    code = f"""
# Requirements: `pip install distilabel[hf-inference-endpoints, instructor]`
import os
from distilabel.pipeline import Pipeline
from distilabel.steps import LoadDataFromHub
from distilabel.steps.tasks import TextGeneration
from distilabel.llms import InferenceEndpointsLLM

MODEL = "{MODEL}"
BASE_URL = "{BASE_URL}"
CUSTOM_TEMPLATE = {prompt_template!r}
os.environ["HF_TOKEN"] = "hf_xxx" # https://huggingface.co/settings/tokens/new?ownUserPermissions=repo.content.read&ownUserPermissions=repo.write&globalPermissions=inference.serverless.write&canReadGatedRepos=true&tokenType=fineGrained

with Pipeline(name="custom-evaluation") as pipeline:
    load_the_dataset = LoadDataFromHub(
        repo_id="{repo_id}",
        config="{subset}",
        split="{split}",
        num_examples={num_rows},
        batch_size=2
    )

    custom_evaluator = TextGeneration(
        llm=InferenceEndpointsLLM(
            model_id=MODEL,
            base_url=BASE_URL,
            api_key=os.environ["HF_TOKEN"],
            structured_output={{"format": "json", "schema": {structured_output}}},
            generation_kwargs={{
                "temperature": 0.01,
                "max_new_tokens": {MAX_NUM_TOKENS},
            }},
        ),
        template=CUSTOM_TEMPLATE,
        columns={columns}
    )

    load_the_dataset >> custom_evaluator

if __name__ == "__main__":
    distiset = pipeline.run()
"""
    return code
def generate_pipeline_code(
    repo_id,
    aspects,
    instruction_column,
    response_columns,
    prompt_template,
    structured_output,
    num_rows,
    eval_type,
):
    """Return generated pipeline source for the requested evaluation type.

    When ``repo_id`` is ``None`` the subset and split fall back to
    ``"default"`` and ``"train"``; otherwise the first configuration name and
    the first split name reported for the repository are used.

    ``eval_type == "ultrafeedback"`` selects the UltraFeedback generator; any
    other value selects the custom-evaluation generator.
    """
    subset, split = "default", "train"
    if repo_id is not None:
        subset = get_dataset_config_names(repo_id)[0]
        split = get_dataset_split_names(repo_id, subset)[0]

    if eval_type != "ultrafeedback":
        return generate_custom_pipeline_code(
            repo_id=repo_id,
            subset=subset,
            split=split,
            prompt_template=prompt_template,
            structured_output=structured_output,
            num_rows=num_rows,
        )

    return generate_ultrafeedback_pipeline_code(
        repo_id=repo_id,
        subset=subset,
        split=split,
        aspects=aspects,
        instruction_column=instruction_column,
        response_columns=response_columns,
        num_rows=num_rows,
    )