Spaces:

argilla
/

synthetic-data-generator

Running

File size: 17,029 Bytes

from datasets import get_dataset_config_names, get_dataset_split_names
from distilabel.steps.tasks import (
    ChatGeneration,
    Magpie,
    GenerateSentencePair,
    TextGeneration,
)

from synthetic_dataset_generator.constants import (
    MAGPIE_PRE_QUERY_TEMPLATE,
    MAX_NUM_TOKENS,
)
from synthetic_dataset_generator.pipelines.base import _get_llm, _get_llm_class

# Example system prompts, one per task category. Each constant is a single
# string built from adjacent string literals (implicit concatenation), so the
# leading space on every continuation line is what separates the words.
# All of them except MATH_SYSTEM_PROMPT are interpolated into
# PROMPT_CREATION_PROMPT further down as style examples.

# Fact lookup / explanation assistant.
INFORMATION_SEEKING_PROMPT = (
    "You are an AI assistant designed to provide accurate and concise information on a wide"
    " range of topics. Your purpose is to assist users in finding specific facts,"
    " explanations, or details about various subjects. Provide clear, factual responses and,"
    " when appropriate, offer additional context or related information that might be useful"
    " to the user."
)

# Step-by-step logical reasoning assistant.
REASONING_PROMPT = (
    "You are an AI assistant specialized in logical thinking and problem-solving. Your"
    " purpose is to help users work through complex ideas, analyze situations, and draw"
    " conclusions based on given information. Approach each query with structured thinking,"
    " break down problems into manageable parts, and guide users through the reasoning"
    " process step-by-step."
)

# Planning / strategy assistant.
PLANNING_PROMPT = (
    "You are an AI assistant focused on helping users create effective plans and strategies."
    " Your purpose is to assist in organizing thoughts, setting goals, and developing"
    " actionable steps for various projects or activities. Offer structured approaches,"
    " consider potential challenges, and provide tips for efficient execution of plans."
)

# Text editing / proofreading assistant.
EDITING_PROMPT = (
    "You are an AI assistant specialized in editing and improving written content. Your"
    " purpose is to help users refine their writing by offering suggestions for grammar,"
    " style, clarity, and overall structure. Provide constructive feedback, explain your"
    " edits, and offer alternative phrasings when appropriate."
)

# Programming help / debugging assistant.
CODING_DEBUGGING_PROMPT = (
    "You are an AI assistant designed to help with programming tasks. Your purpose is to"
    " assist users in writing, reviewing, and debugging code across various programming"
    " languages. Provide clear explanations, offer best practices, and help troubleshoot"
    " issues. When appropriate, suggest optimizations or alternative approaches to coding"
    " problems."
)

# Math tutoring assistant.
# NOTE(review): unlike its siblings this constant is not referenced by
# PROMPT_CREATION_PROMPT in the visible part of this file.
MATH_SYSTEM_PROMPT = (
    "You are an AI assistant designed to provide helpful, step-by-step guidance on solving"
    " math problems. The user will ask you a wide range of complex mathematical questions."
    " Your purpose is to assist users in understanding mathematical concepts, working through"
    " equations, and arriving at the correct solutions."
)

# Persona / role-play assistant.
ROLE_PLAYING_PROMPT = (
    "You are an AI assistant capable of engaging in various role-playing scenarios. Your"
    " purpose is to adopt different personas or characters as requested by the user. Maintain"
    " consistency with the chosen role, respond in character, and help create immersive and"
    " interactive experiences for the user."
)

# Data analysis / statistics assistant.
DATA_ANALYSIS_PROMPT = (
    "You are an AI assistant specialized in data analysis and interpretation. Your purpose is"
    " to help users understand and derive insights from data sets, statistics, and analytical"
    " tasks. Offer clear explanations of data trends, assist with statistical calculations,"
    " and provide guidance on data visualization and interpretation techniques."
)

# Creative writing assistant.
CREATIVE_WRITING_PROMPT = (
    "You are an AI assistant designed to support creative writing endeavors. Your purpose is"
    " to help users craft engaging stories, poems, and other creative texts. Offer"
    " suggestions for plot development, character creation, dialogue writing, and other"
    " aspects of creative composition. Provide constructive feedback and inspire creativity."
)

# Personal / professional advice assistant.
ADVICE_SEEKING_PROMPT = (
    "You are an AI assistant focused on providing thoughtful advice and guidance. Your"
    " purpose is to help users navigate various personal or professional issues by offering"
    " balanced perspectives, considering potential outcomes, and suggesting practical"
    " solutions. Encourage users to think critically about their situations while providing"
    " supportive and constructive advice."
)

# Idea generation assistant.
BRAINSTORMING_PROMPT = (
    "You are an AI assistant specialized in generating ideas and facilitating creative"
    " thinking. Your purpose is to help users explore possibilities, think outside the box,"
    " and develop innovative concepts. Encourage free-flowing thoughts, offer diverse"
    " perspectives, and help users build upon and refine their ideas."
)

# Meta system prompt: instructs the LLM to write a *new* system prompt from a
# user-supplied dataset description. It is an f-string evaluated once at import
# time, embedding ten of the category prompts above as style examples. The text
# deliberately ends with "User dataset description:" followed by a newline.
# Passed as `system_prompt` to the TextGeneration task in get_prompt_generator().
PROMPT_CREATION_PROMPT = f"""You are an AI assistant specialized in generating very precise prompts for dataset creation.

Your task is to write a prompt following the instruction of the user. Respond with the prompt and nothing else.

In the generated prompt always finish with this sentence: User questions are direct and concise.

The prompt you write should follow the same style and structure as the following example prompts:

{INFORMATION_SEEKING_PROMPT}

{REASONING_PROMPT}

{PLANNING_PROMPT}

{CODING_DEBUGGING_PROMPT}

{EDITING_PROMPT}

{ROLE_PLAYING_PROMPT}

{DATA_ANALYSIS_PROMPT}

{CREATIVE_WRITING_PROMPT}

{ADVICE_SEEKING_PROMPT}

{BRAINSTORMING_PROMPT}

User dataset description:
"""

# Template used to ask the LLM for the next *user* message of a conversation.
# It iterates over a `messages` sequence whose items expose `role` and
# `content`, rendering user and assistant turns with different labels; items
# with any other role produce no text. The syntax looks like Jinja2
# (presumably rendered by distilabel's TextGeneration `template` argument —
# confirm against the distilabel docs). `.rstrip()` drops the trailing newline
# left by the closing triple quote.
FOLLOW_UP_TEMPLATE = """Conversation:
{% for message in messages %}
    {% if message.role == "user" %}
User Question: {{ message.content }}
    {% elif message.role == "assistant" %}
Assistant Response: {{ message.content }}
    {% endif %}
{% endfor %}

Please generate the next logical user message in this conversation. Do not include any other information or 'User Question' in your response.
""".rstrip()

# Sample dataset descriptions. Not referenced elsewhere in the visible part of
# this file — presumably shown as examples/defaults by the UI (TODO confirm).
DEFAULT_DATASET_DESCRIPTIONS = [
    "rude customer assistant for a phone company",
    "assistant that solves math puzzles using python",
]
# Stop sequences handed to the Magpie LLM in get_magpie_generator(). They
# depend on which pre-query chat template is configured: "qwen2" has its own
# markers, while "llama3" and every other value share the Llama 3 markers.
if MAGPIE_PRE_QUERY_TEMPLATE == "qwen2":
    _STOP_SEQUENCES = ["<|im_end|>", "<|im_start|>", "assistant", "\n\n"]
else:
    # Note the leading space in the last entry; it differs from the qwen2 list.
    _STOP_SEQUENCES = ["<|eot_id|>", "<|start_header_id|>", "assistant", " \n\n"]


def _get_output_mappings(num_turns: int):
    if num_turns == 1:
        return {"instruction": "prompt", "response": "completion"}
    else:
        return {"conversation": "messages"}


def get_prompt_generator():
    """Build and load the task that writes a system prompt from a description.

    The task is a ``TextGeneration`` step that uses ``PROMPT_CREATION_PROMPT``
    as its system prompt. Sampling is enabled with a fixed temperature of 0.8
    and a budget of ``MAX_NUM_TOKENS`` new tokens.

    Returns:
        The ``TextGeneration`` instance, with ``load()`` already called.
    """
    llm = _get_llm(
        generation_kwargs={
            "temperature": 0.8,
            "max_new_tokens": MAX_NUM_TOKENS,
            "do_sample": True,
        }
    )
    task = TextGeneration(
        llm=llm,
        system_prompt=PROMPT_CREATION_PROMPT,
        use_system_prompt=True,
    )
    task.load()
    return task


def get_magpie_generator(num_turns: int, temperature: float, is_sample: bool):
    """Build and load a ``Magpie`` task for single- or multi-turn generation.

    Args:
        num_turns: Number of turns. ``1`` produces instruction-only output
            (``only_instruction=True``); any other value produces a
            conversation that ends with a user message (``end_with_user=True``).
        temperature: Sampling temperature forwarded to the LLM.
        is_sample: When true, cap generation at 256 new tokens; otherwise use
            a fraction of ``MAX_NUM_TOKENS`` (25% single-turn, 50% multi-turn).

    Returns:
        The ``Magpie`` instance, with ``load()`` already called.
    """
    single_turn = num_turns == 1

    # Copy so the task gets a dict that nothing else holds a reference to.
    column_renames = _get_output_mappings(num_turns).copy()

    budget_share = 0.25 if single_turn else 0.5
    llm = _get_llm(
        generation_kwargs={
            "temperature": temperature,
            "do_sample": True,
            "max_new_tokens": 256 if is_sample else int(MAX_NUM_TOKENS * budget_share),
            "stop_sequences": _STOP_SEQUENCES,
        },
        magpie_pre_query_template=MAGPIE_PRE_QUERY_TEMPLATE,
        use_magpie_template=True,
    )

    # The only Magpie argument that differs between the two modes.
    mode_kwargs = {"only_instruction": True} if single_turn else {"end_with_user": True}
    generator = Magpie(
        llm=llm,
        n_turns=num_turns,
        output_mappings=column_renames,
        **mode_kwargs,
    )
    generator.load()
    return generator


def get_sentence_pair_generator(temperature: float, is_sample: bool):
    """Build and load the ``GenerateSentencePair`` task used to derive queries.

    Args:
        temperature: Sampling temperature forwarded to the LLM.
        is_sample: When true, cap generation at 256 new tokens instead of
            ``MAX_NUM_TOKENS``.

    Returns:
        The ``GenerateSentencePair`` instance (``action="query"``, pairs rather
        than triplets, hard negatives enabled), with ``load()`` already called.
    """
    token_budget = 256 if is_sample else MAX_NUM_TOKENS
    llm = _get_llm(
        generation_kwargs={
            "temperature": temperature,
            "max_new_tokens": token_budget,
        }
    )
    task = GenerateSentencePair(
        llm=llm,
        triplet=False,
        action="query",
        hard_negative=True,
    )
    task.load()
    return task


def get_response_generator(
    system_prompt: str, num_turns: int, temperature: float, is_sample: bool
):
    """Build and load the task that produces the assistant's answer.

    Args:
        system_prompt: System prompt for the single-turn ``TextGeneration``
            task. Not used in the multi-turn branch.
        num_turns: ``1`` selects ``TextGeneration`` (reads ``prompt``); any
            other value selects ``ChatGeneration`` (reads ``messages``).
        temperature: Sampling temperature forwarded to the LLM.
        is_sample: Single-turn only: cap generation at 256 new tokens instead
            of half of ``MAX_NUM_TOKENS``. The multi-turn branch always uses
            the full ``MAX_NUM_TOKENS``.

    Returns:
        The loaded task; its ``generation`` output is renamed to ``completion``.
    """
    if num_turns != 1:
        chat_llm = _get_llm(
            is_completion=True,
            generation_kwargs={
                "temperature": temperature,
                "max_new_tokens": MAX_NUM_TOKENS,
            },
        )
        task = ChatGeneration(
            llm=chat_llm,
            output_mappings={"generation": "completion"},
            input_mappings={"conversation": "messages"},
        )
    else:
        token_budget = 256 if is_sample else int(MAX_NUM_TOKENS * 0.5)
        text_llm = _get_llm(
            is_completion=True,
            generation_kwargs={
                "temperature": temperature,
                "max_new_tokens": token_budget,
            },
        )
        task = TextGeneration(
            llm=text_llm,
            system_prompt=system_prompt,
            output_mappings={"generation": "completion"},
            input_mappings={"instruction": "prompt"},
        )
    task.load()
    return task


def get_follow_up_generator(type: str, temperature: float, is_sample: bool):
    """Build and load the task that continues an existing conversation.

    Args:
        type: ``"instruction"`` builds a ``TextGeneration`` task that renders
            ``FOLLOW_UP_TEMPLATE`` over the ``messages`` column; any other
            value builds a plain ``ChatGeneration`` task. (The name shadows
            the builtin but is kept because callers may pass it by keyword.)
        temperature: Sampling temperature forwarded to the LLM.
        is_sample: ``"instruction"`` branch only: cap generation at 256 new
            tokens instead of half of ``MAX_NUM_TOKENS``. The other branch
            always uses the full ``MAX_NUM_TOKENS``.

    Returns:
        The loaded task.
    """
    wants_instruction = type == "instruction"
    if not wants_instruction:
        task = ChatGeneration(
            llm=_get_llm(
                is_completion=True,
                generation_kwargs={
                    "temperature": temperature,
                    "max_new_tokens": MAX_NUM_TOKENS,
                },
            ),
        )
    else:
        token_budget = 256 if is_sample else int(MAX_NUM_TOKENS * 0.5)
        task = TextGeneration(
            llm=_get_llm(
                generation_kwargs={
                    "temperature": temperature,
                    "max_new_tokens": token_budget,
                }
            ),
            template=FOLLOW_UP_TEMPLATE,
            columns=["messages"],
        )
    task.load()
    return task

def generate_pipeline_code_system_prompt(
    system_prompt: str,
    num_turns: int,
    num_rows: int,
):
    """Render a standalone distilabel script for the system-prompt workflow.

    The script builds a one-step ``MagpieGenerator`` pipeline followed by a
    ``KeepColumns`` step and runs it under ``if __name__ == "__main__"``.

    Args:
        system_prompt: System prompt embedded in the script as ``SYSTEM_PROMPT``.
        num_turns: Value for ``n_turns``; also selects the output column
            mapping via ``_get_output_mappings``.
        num_rows: Value for ``num_rows``.

    Returns:
        The Python source of the script as a string. It starts at column 0 so
        it can be pasted into a file and executed as-is.
    """
    input_mappings = _get_output_mappings(num_turns)
    llm_class = _get_llm_class()
    llm_config = _get_llm().dump()
    kept_columns = list(input_mappings.values())

    # The template is written flush-left on purpose: emitting it with the
    # function's own indentation would make every top-level statement of the
    # generated script an "unexpected indent".
    # `{system_prompt!r}` emits a valid Python literal even when the prompt
    # contains quotes, backslashes or newlines.
    # The LLM class is imported from `distilabel.models`, the same location the
    # seed-based generator in this module uses.
    code = f"""
# Requirements: `pip install distilabel[hf-inference-endpoints]`
import os
from distilabel.pipeline import Pipeline
from distilabel.steps import KeepColumns
from distilabel.steps.tasks import MagpieGenerator
from distilabel.models import {llm_class}

SYSTEM_PROMPT = {system_prompt!r}

with Pipeline(name="sft") as pipeline:
    magpie = MagpieGenerator(
        llm={llm_class}.from_dict(
            {llm_config}
        ),
        n_turns={num_turns},
        num_rows={num_rows},
        batch_size=1,
        system_prompt=SYSTEM_PROMPT,
        output_mappings={input_mappings},
    )
    keep_columns = KeepColumns(
        columns={kept_columns} + ["model_name"],
    )
    magpie.connect(keep_columns)

if __name__ == "__main__":
    distiset = pipeline.run()
"""
    return code

def generate_pipeline_code_seed(
    repo_id: str,
    subset: str,
    split: str,
    input_type: str,
    document_column: str,
    num_turns: int,
    num_rows: int,
):
    """Render a standalone distilabel script for the seed-data workflow.

    The script loads seed documents, generates a query per document with
    ``GenerateSentencePair``, answers it with ``TextGeneration`` and, for
    multi-turn output, appends ``num_turns - 1`` follow-up question/answer
    rounds before keeping only the final columns.

    Args:
        repo_id: Hub dataset id (used when ``input_type == "dataset-input"``).
        subset: Dataset config name for the Hub loader.
        split: Dataset split name for the Hub loader.
        input_type: ``"dataset-input"`` loads from the Hub; any other value
            emits a ``LoadDataFromDicts`` loader instead.
        document_column: Hub column that is renamed to ``anchor``.
        num_turns: Number of conversation turns; values above 1 add the
            follow-up helper steps.
        num_rows: ``num_examples`` for the Hub loader.

    Returns:
        The Python source of the script as a string.
    """
    llm_class = _get_llm_class()
    llm_config = _get_llm().dump()
    from_hub = input_type == "dataset-input"
    multi_turn = num_turns > 1

    step_names = ["KeepColumns", "LoadDataFromHub" if from_hub else "LoadDataFromDicts"]
    task_names = ["GenerateSentencePair", "TextGeneration"]
    if multi_turn:
        step_names += ["StepInput", "step"]
        task_names.append("ChatGeneration")
    step_imports = ", ".join(step_names)
    task_imports = ", ".join(task_names)

    code = f"""
# Requirements: `pip install distilabel[hf-inference-endpoints]`
from distilabel.models import {llm_class}
from distilabel.pipeline import Pipeline
from distilabel.steps import {step_imports}
from distilabel.steps.tasks import {task_imports}
"""

    if multi_turn:
        # Plain (non-f) string: template and dict braces are written exactly
        # once because nothing un-escapes doubled braces here.
        # The return annotation is quoted because the generated script never
        # imports `StepOutput`; a bare name would raise NameError at def time.
        code += """
FOLLOW_UP_TEMPLATE = '''Conversation:
{% for message in messages %}
    {% if message.role == "user" %}
User Question: {{ message.content }}
    {% elif message.role == "assistant" %}
Assistant Response: {{ message.content }}
    {% endif %}
{% endfor %}

Please generate the next logical user message in this conversation. Do not include any other information or 'User Question' in your response.
'''.rstrip()

@step(inputs=["prompt", "completion"], outputs=["messages"])
def PrepareMessages(*inputs: StepInput) -> "StepOutput":
    for input in inputs:
        for item in input:
            item["messages"] = [
                {"role": "user", "content": item["prompt"]},
                {"role": "assistant", "content": item["completion"]},
            ]
        yield input


@step(inputs=["messages", "generation"], outputs=["messages"])
def FormatMessagesInstruction(*inputs: StepInput) -> "StepOutput":
    for input in inputs:
        for item in input:
            item["messages"].append({"role": "user", "content": item["generation"]})
        yield input


@step(inputs=["messages", "generation"], outputs=["messages"])
def FormatMessagesResponse(*inputs: StepInput) -> "StepOutput":
    for input in inputs:
        for item in input:
            item["messages"].append({"role": "assistant", "content": item["generation"]})
        yield input
"""

    if from_hub:
        # `!r` yields properly quoted/escaped Python literals.
        code += f"""
with Pipeline(name="sft") as pipeline:
    load_the_dataset = LoadDataFromHub(
        repo_id={repo_id!r},
        config={subset!r},
        split={split!r},
        num_examples={num_rows},
        batch_size=2,
        output_mappings={{{document_column!r}: 'anchor'}},
    )
"""
    else:
        # `process_and_chunk_files` and `files` are placeholders the user of
        # the generated script has to provide; the script does not define them.
        code += """
data = process_and_chunk_files(files=[files])

with Pipeline(name="sft") as pipeline:
    load_the_dataset = LoadDataFromDicts(
        data = data
    )
"""

    code += f"""
    instruction_generator = GenerateSentencePair(
        name="instruction_generation",
        triplet=False,
        hard_negative=True,
        action="query",
        llm={llm_class}.from_dict(
            {llm_config}
        ),
        input_batch_size=10,
        output_mappings={{"positive": "prompt"}},
    )

    response_generator = TextGeneration(
        name="response_generation",
        llm={llm_class}.from_dict(
            {llm_config}
        ),
        input_batch_size=10,
        input_mappings={{"instruction": "prompt"}},
        output_mappings={{"generation": "completion"}},
    )
"""

    # Step names in connection order. Built alongside the definitions so the
    # final `>>` chain can only mention steps that were actually emitted.
    chain = ["load_the_dataset", "instruction_generator", "response_generator"]

    if multi_turn:
        code += """
    prepare_messages = PrepareMessages()
"""
        chain.append("prepare_messages")

        for i in range(num_turns - 1):
            code += f"""
    follow_up_instruction_{i} = TextGeneration(
        llm={llm_class}.from_dict(
            {llm_config}
        ),
        template=FOLLOW_UP_TEMPLATE,
        columns=["messages"],
    )
    format_instruction_{i} = FormatMessagesInstruction()
    follow_up_response_{i} = ChatGeneration(
        llm={llm_class}.from_dict(
            {llm_config}
        ),
    )
    format_response_{i} = FormatMessagesResponse()
"""
            chain += [
                f"follow_up_instruction_{i}",
                f"format_instruction_{i}",
                f"follow_up_response_{i}",
                f"format_response_{i}",
            ]
        kept_columns = '["messages"]'
    else:
        # Single-turn scripts also need their steps connected; keep the two
        # columns produced by the generators above.
        kept_columns = '["prompt", "completion"]'

    chain.append("keep_columns")
    connections = " >> ".join(chain)

    # Emitted at the same 4-space indent as the rest of the `with` body.
    code += f"""
    keep_columns = KeepColumns(columns={kept_columns})
    {connections}
"""

    code += """
if __name__ == "__main__":
    distiset = pipeline.run()
"""
    return code

def generate_pipeline_code(
    repo_id: str,
    input_type: str,
    system_prompt: str,
    document_column: str,
    num_turns: int,
    num_rows: int,
):
    """Dispatch to the code generator that matches ``input_type``.

    Args:
        repo_id: Hub dataset id; only consulted for ``"dataset-input"``.
        input_type: ``"prompt-type"`` selects the system-prompt script; every
            other value selects the seed-data script.
        system_prompt: Forwarded to the system-prompt script.
        document_column: Forwarded to the seed-data script.
        num_turns: Forwarded to whichever generator is selected.
        num_rows: Forwarded to whichever generator is selected.

    Returns:
        The generated Python source as a string.
    """
    if input_type == "prompt-type":
        return generate_pipeline_code_system_prompt(
            system_prompt=system_prompt,
            num_turns=num_turns,
            num_rows=num_rows,
        )

    # Defaults, replaced by the dataset's first config/split when a Hub
    # dataset was actually given.
    subset, split = "default", "train"
    if input_type == "dataset-input" and repo_id is not None:
        subset = get_dataset_config_names(repo_id)[0]
        split = get_dataset_split_names(repo_id, subset)[0]

    return generate_pipeline_code_seed(
        repo_id=repo_id,
        subset=subset,
        split=split,
        input_type=input_type,
        document_column=document_column,
        num_turns=num_turns,
        num_rows=num_rows,
    )