from datasets import get_dataset_config_names, get_dataset_split_names
from distilabel.steps.tasks import (
    ChatGeneration,
    GenerateSentencePair,
    Magpie,
    TextGeneration,
)

from synthetic_dataset_generator.constants import (
    MAGPIE_PRE_QUERY_TEMPLATE,
    MAX_NUM_TOKENS,
)
from synthetic_dataset_generator.pipelines.base import _get_llm, _get_llm_class

INFORMATION_SEEKING_PROMPT = (
    "You are an AI assistant designed to provide accurate and concise information on a wide"
    " range of topics. Your purpose is to assist users in finding specific facts,"
    " explanations, or details about various subjects. Provide clear, factual responses and,"
    " when appropriate, offer additional context or related information that might be useful"
    " to the user."
)

REASONING_PROMPT = (
    "You are an AI assistant specialized in logical thinking and problem-solving. Your"
    " purpose is to help users work through complex ideas, analyze situations, and draw"
    " conclusions based on given information. Approach each query with structured thinking,"
    " break down problems into manageable parts, and guide users through the reasoning"
    " process step-by-step."
)

PLANNING_PROMPT = (
    "You are an AI assistant focused on helping users create effective plans and strategies."
    " Your purpose is to assist in organizing thoughts, setting goals, and developing"
    " actionable steps for various projects or activities. Offer structured approaches,"
    " consider potential challenges, and provide tips for efficient execution of plans."
)

EDITING_PROMPT = (
    "You are an AI assistant specialized in editing and improving written content. Your"
    " purpose is to help users refine their writing by offering suggestions for grammar,"
    " style, clarity, and overall structure. Provide constructive feedback, explain your"
    " edits, and offer alternative phrasings when appropriate."
)

CODING_DEBUGGING_PROMPT = (
    "You are an AI assistant designed to help with programming tasks. Your purpose is to"
    " assist users in writing, reviewing, and debugging code across various programming"
    " languages. Provide clear explanations, offer best practices, and help troubleshoot"
    " issues. When appropriate, suggest optimizations or alternative approaches to coding"
    " problems."
)

MATH_SYSTEM_PROMPT = (
    "You are an AI assistant designed to provide helpful, step-by-step guidance on solving"
    " math problems. The user will ask you a wide range of complex mathematical questions."
    " Your purpose is to assist users in understanding mathematical concepts, working through"
    " equations, and arriving at the correct solutions."
)

ROLE_PLAYING_PROMPT = (
    "You are an AI assistant capable of engaging in various role-playing scenarios. Your"
    " purpose is to adopt different personas or characters as requested by the user. Maintain"
    " consistency with the chosen role, respond in character, and help create immersive and"
    " interactive experiences for the user."
)

DATA_ANALYSIS_PROMPT = (
    "You are an AI assistant specialized in data analysis and interpretation. Your purpose is"
    " to help users understand and derive insights from data sets, statistics, and analytical"
    " tasks. Offer clear explanations of data trends, assist with statistical calculations,"
    " and provide guidance on data visualization and interpretation techniques."
)
Offer" " suggestions for plot development, character creation, dialogue writing, and other" " aspects of creative composition. Provide constructive feedback and inspire creativity." ) ADVICE_SEEKING_PROMPT = ( "You are an AI assistant focused on providing thoughtful advice and guidance. Your" " purpose is to help users navigate various personal or professional issues by offering" " balanced perspectives, considering potential outcomes, and suggesting practical" " solutions. Encourage users to think critically about their situations while providing" " supportive and constructive advice." ) BRAINSTORMING_PROMPT = ( "You are an AI assistant specialized in generating ideas and facilitating creative" " thinking. Your purpose is to help users explore possibilities, think outside the box," " and develop innovative concepts. Encourage free-flowing thoughts, offer diverse" " perspectives, and help users build upon and refine their ideas." ) PROMPT_CREATION_PROMPT = f"""You are an AI assistant specialized in generating very precise prompts for dataset creation. Your task is to write a prompt following the instruction of the user. Respond with the prompt and nothing else. In the generated prompt always finish with this sentence: User questions are direct and concise. The prompt you write should follow the same style and structure as the following example prompts: {INFORMATION_SEEKING_PROMPT} {REASONING_PROMPT} {PLANNING_PROMPT} {CODING_DEBUGGING_PROMPT} {EDITING_PROMPT} {ROLE_PLAYING_PROMPT} {DATA_ANALYSIS_PROMPT} {CREATIVE_WRITING_PROMPT} {ADVICE_SEEKING_PROMPT} {BRAINSTORMING_PROMPT} User dataset description: """ FOLLOW_UP_TEMPLATE = """Conversation: {% for message in messages %} {% if message.role == "user" %} User Question: {{ message.content }} {% elif message.role == "assistant" %} Assistant Response: {{ message.content }} {% endif %} {% endfor %} Please generate the next logical user message in this conversation. Do not include any other information or 'User Question' in your response. 
""".rstrip() DEFAULT_DATASET_DESCRIPTIONS = [ "rude customer assistant for a phone company", "assistant that solves math puzzles using python", ] if MAGPIE_PRE_QUERY_TEMPLATE == "llama3": _STOP_SEQUENCES = [ "<|eot_id|>", "<|start_header_id|>", "assistant", " \n\n", ] elif MAGPIE_PRE_QUERY_TEMPLATE == "qwen2": _STOP_SEQUENCES = ["<|im_end|>", "<|im_start|>", "assistant", "\n\n"] else: _STOP_SEQUENCES = [ "<|eot_id|>", "<|start_header_id|>", "assistant", " \n\n", ] def _get_output_mappings(num_turns: int): if num_turns == 1: return {"instruction": "prompt", "response": "completion"} else: return {"conversation": "messages"} def get_prompt_generator(): generation_kwargs = { "temperature": 0.8, "max_new_tokens": MAX_NUM_TOKENS, "do_sample": True, } prompt_generator = TextGeneration( llm=_get_llm(generation_kwargs=generation_kwargs), system_prompt=PROMPT_CREATION_PROMPT, use_system_prompt=True, ) prompt_generator.load() return prompt_generator def get_magpie_generator(num_turns: int, temperature: float, is_sample: bool): input_mappings = _get_output_mappings(num_turns) output_mappings = input_mappings.copy() if num_turns == 1: generation_kwargs = { "temperature": temperature, "do_sample": True, "max_new_tokens": 256 if is_sample else int(MAX_NUM_TOKENS * 0.25), "stop_sequences": _STOP_SEQUENCES, } magpie_generator = Magpie( llm=_get_llm( generation_kwargs=generation_kwargs, magpie_pre_query_template=MAGPIE_PRE_QUERY_TEMPLATE, use_magpie_template=True, ), n_turns=num_turns, output_mappings=output_mappings, only_instruction=True, ) else: generation_kwargs = { "temperature": temperature, "do_sample": True, "max_new_tokens": 256 if is_sample else int(MAX_NUM_TOKENS * 0.5), "stop_sequences": _STOP_SEQUENCES, } magpie_generator = Magpie( llm=_get_llm( generation_kwargs=generation_kwargs, magpie_pre_query_template=MAGPIE_PRE_QUERY_TEMPLATE, use_magpie_template=True, ), end_with_user=True, n_turns=num_turns, output_mappings=output_mappings, ) magpie_generator.load() return magpie_generator def get_sentence_pair_generator(temperature: float, is_sample: bool): generation_kwargs = { "temperature": temperature, "max_new_tokens": 256 if is_sample else MAX_NUM_TOKENS, } sentence_pair_generator = GenerateSentencePair( llm=_get_llm(generation_kwargs=generation_kwargs), triplet=False, action="query", hard_negative=True, ) sentence_pair_generator.load() return sentence_pair_generator def get_response_generator( system_prompt: str, num_turns: int, temperature: float, is_sample: bool ): if num_turns == 1: generation_kwargs = { "temperature": temperature, "max_new_tokens": 256 if is_sample else int(MAX_NUM_TOKENS * 0.5), } response_generator = TextGeneration( llm=_get_llm(is_completion=True, generation_kwargs=generation_kwargs), system_prompt=system_prompt, output_mappings={"generation": "completion"}, input_mappings={"instruction": "prompt"}, ) else: generation_kwargs = { "temperature": temperature, "max_new_tokens": MAX_NUM_TOKENS, } response_generator = ChatGeneration( llm=_get_llm(is_completion=True, generation_kwargs=generation_kwargs), output_mappings={"generation": "completion"}, input_mappings={"conversation": "messages"}, ) response_generator.load() return response_generator def get_follow_up_generator(type: str, temperature: float, is_sample: bool): if type == "instruction": generation_kwargs = { "temperature": temperature, "max_new_tokens": 256 if is_sample else int(MAX_NUM_TOKENS * 0.5), } follow_up_generator = TextGeneration( llm=_get_llm(generation_kwargs=generation_kwargs), 
def get_follow_up_generator(type: str, temperature: float, is_sample: bool):
    if type == "instruction":
        generation_kwargs = {
            "temperature": temperature,
            "max_new_tokens": 256 if is_sample else int(MAX_NUM_TOKENS * 0.5),
        }
        follow_up_generator = TextGeneration(
            llm=_get_llm(generation_kwargs=generation_kwargs),
            template=FOLLOW_UP_TEMPLATE,
            columns=["messages"],
        )
    else:
        generation_kwargs = {
            "temperature": temperature,
            "max_new_tokens": MAX_NUM_TOKENS,
        }
        follow_up_generator = ChatGeneration(
            llm=_get_llm(is_completion=True, generation_kwargs=generation_kwargs),
        )
    follow_up_generator.load()
    return follow_up_generator


def generate_pipeline_code_system_prompt(
    system_prompt: str,
    num_turns: int,
    num_rows: int,
):
    input_mappings = _get_output_mappings(num_turns)
    code = f"""
# Requirements: `pip install distilabel[hf-inference-endpoints]`
import os
from distilabel.pipeline import Pipeline
from distilabel.steps import KeepColumns
from distilabel.steps.tasks import MagpieGenerator
from distilabel.models import {_get_llm_class()}

SYSTEM_PROMPT = "{system_prompt}"

with Pipeline(name="sft") as pipeline:
    magpie = MagpieGenerator(
        llm={_get_llm_class()}.from_dict(
            {_get_llm().dump()}
        ),
        n_turns={num_turns},
        num_rows={num_rows},
        batch_size=1,
        system_prompt=SYSTEM_PROMPT,
        output_mappings={input_mappings},
    )
    keep_columns = KeepColumns(
        columns={list(input_mappings.values())} + ["model_name"],
    )
    magpie.connect(keep_columns)

if __name__ == "__main__":
    distiset = pipeline.run()
"""
    return code


def generate_pipeline_code_seed(
    repo_id: str,
    subset: str,
    split: str,
    input_type: str,
    document_column: str,
    num_turns: int,
    num_rows: int,
):
    code = f"""
# Requirements: `pip install distilabel[hf-inference-endpoints]`
from distilabel.models import {_get_llm_class()}
from distilabel.pipeline import Pipeline
from distilabel.steps import KeepColumns{", LoadDataFromDicts" if input_type != "dataset-input" else ""}{", LoadDataFromHub" if input_type == "dataset-input" else ""}{", StepInput, step" if num_turns > 1 else ""}
from distilabel.steps.tasks import GenerateSentencePair, TextGeneration{", ChatGeneration" if num_turns > 1 else ""}
{"from distilabel.steps.typing import StepOutput" if num_turns > 1 else ""}
"""
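    # Illustrative: with input_type="dataset-input" and num_turns=1, and assuming
    # _get_llm_class() returns "InferenceEndpointsLLM", the header above renders to:
    #
    #     from distilabel.models import InferenceEndpointsLLM
    #     from distilabel.pipeline import Pipeline
    #     from distilabel.steps import KeepColumns, LoadDataFromHub
    #     from distilabel.steps.tasks import GenerateSentencePair, TextGeneration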
    if num_turns > 1:
        code += """
FOLLOW_UP_TEMPLATE = '''Conversation:
{% for message in messages %}
{% if message.role == "user" %}
User Question: {{ message.content }}
{% elif message.role == "assistant" %}
Assistant Response: {{ message.content }}
{% endif %}
{% endfor %}

Please generate the next logical user message in this conversation. Do not include any other information or 'User Question' in your response.
'''.rstrip()


@step(inputs=["prompt", "completion"], outputs=["messages"])
def PrepareMessages(*inputs: StepInput) -> StepOutput:
    for input in inputs:
        for item in input:
            item["messages"] = [
                {"role": "user", "content": item["prompt"]},
                {"role": "assistant", "content": item["completion"]},
            ]
        yield input


@step(inputs=["messages", "generation"], outputs=["messages"])
def FormatMessagesInstruction(*inputs: StepInput) -> StepOutput:
    for input in inputs:
        for item in input:
            item["messages"].append({"role": "user", "content": item["generation"]})
        yield input


@step(inputs=["messages", "generation"], outputs=["messages"])
def FormatMessagesResponse(*inputs: StepInput) -> StepOutput:
    for input in inputs:
        for item in input:
            item["messages"].append({"role": "assistant", "content": item["generation"]})
        yield input
"""

    if input_type == "dataset-input":
        code += f"""
with Pipeline(name="sft") as pipeline:
    load_the_dataset = LoadDataFromHub(
        repo_id='{repo_id}',
        config='{subset}',
        split='{split}',
        num_examples={num_rows},
        batch_size=2,
        output_mappings={{'{document_column}': 'anchor'}},
    )
"""
    else:
        code += """
# NOTE: `process_and_chunk_files` and `files` are placeholders that the caller
# must provide; they are not part of distilabel.
data = process_and_chunk_files(files=[files])

with Pipeline(name="sft") as pipeline:
    load_the_dataset = LoadDataFromDicts(
        data=data,
    )
"""

    code += f"""
    instruction_generator = GenerateSentencePair(
        name="instruction_generation",
        triplet=False,
        hard_negative=True,
        action="query",
        llm={_get_llm_class()}.from_dict(
            {_get_llm().dump()}
        ),
        input_batch_size=10,
        output_mappings={{"positive": "prompt"}},
    )

    response_generator = TextGeneration(
        name="response_generation",
        llm={_get_llm_class()}.from_dict(
            {_get_llm().dump()}
        ),
        input_batch_size=10,
        input_mappings={{"instruction": "prompt"}},
        output_mappings={{"generation": "completion"}},
    )
"""

    if num_turns > 1:
        code += """
    prepare_messages = PrepareMessages()
"""
        for i in range(num_turns - 1):
            code += f"""
    follow_up_instruction_{i} = TextGeneration(
        llm={_get_llm_class()}.from_dict(
            {_get_llm().dump()}
        ),
        template=FOLLOW_UP_TEMPLATE,
        columns=["messages"],
    )
    format_instruction_{i} = FormatMessagesInstruction()
    follow_up_response_{i} = ChatGeneration(
        llm={_get_llm_class()}.from_dict(
            {_get_llm().dump()}
        ),
    )
    format_response_{i} = FormatMessagesResponse()
"""
        code += """
    keep_columns = KeepColumns(columns=["messages"])
"""
        # The follow-up steps above are created with indices 0 .. num_turns - 2,
        # so the connection chain must iterate over the same range.
        code += "    load_the_dataset >> instruction_generator >> response_generator >> prepare_messages"
        for i in range(num_turns - 1):
            code += f" >> follow_up_instruction_{i} >> format_instruction_{i} >> follow_up_response_{i} >> format_response_{i}"
        code += " >> keep_columns"
    else:
        # Single-turn pipelines define no message-formatting or keep-columns steps.
        code += "    load_the_dataset >> instruction_generator >> response_generator"

    code += """

if __name__ == "__main__":
    distiset = pipeline.run()
"""
    return code


def generate_pipeline_code(
    repo_id: str,
    input_type: str,
    system_prompt: str,
    document_column: str,
    num_turns: int,
    num_rows: int,
):
    if input_type == "dataset-input" and repo_id is not None:
        subset = get_dataset_config_names(repo_id)[0]
        split = get_dataset_split_names(repo_id, subset)[0]
    else:
        subset = "default"
        split = "train"
    if input_type == "prompt-type":
        return generate_pipeline_code_system_prompt(
            system_prompt=system_prompt,
            num_turns=num_turns,
            num_rows=num_rows,
        )
    return generate_pipeline_code_seed(
        repo_id=repo_id,
        subset=subset,
        split=split,
        input_type=input_type,
        document_column=document_column,
        num_turns=num_turns,
        num_rows=num_rows,
    )
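# Illustrative usage sketch (hypothetical repo id and column name; the dataset
# must exist on the Hugging Face Hub for config/split detection to succeed):
#
#     pipeline_code = generate_pipeline_code(
#         repo_id="my-org/my-seed-dataset",
#         input_type="dataset-input",
#         system_prompt="",
#         document_column="text",
#         num_turns=2,
#         num_rows=100,
#     )
#     print(pipeline_code)  # a ready-to-run distilabel script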