from src.dataset import formatting_prompts_func


def test_formatting_prompts_func():
    """Check ``formatting_prompts_func`` on a single, an empty and a batched input.

    Each scenario passes a dict of equally long ``instruction`` / ``input`` /
    ``output`` lists together with a three-slot template, and expects the
    result to expose the rendered strings under the ``"text"`` key, one per
    example and in the same order.
    """
    template = "Instruction: {}\nInput: {}\nOutput: {}"
    # NOTE(review): eos_token is the empty string in every call below, so none
    # of these assertions can observe how the function uses it -- confirm
    # whether a non-empty token should be covered as well.
    eos_token = ""

    # Scenario 1: one fully populated example.
    single_batch = {
        "instruction": ["Test instruction"],
        "input": ["Test input"],
        "output": ["Test output"],
    }
    single_result = formatting_prompts_func(single_batch, template, eos_token)

    # The result must carry a 'text' entry holding exactly one rendered prompt.
    assert "text" in single_result
    assert len(single_result["text"]) == 1
    assert single_result["text"][0] == (
        "Instruction: Test instruction\nInput: Test input\nOutput: Test output"
    )

    # Scenario 2 (edge case): every field is an empty string.
    blank_batch = {
        "instruction": [""],
        "input": [""],
        "output": [""],
    }
    blank_result = formatting_prompts_func(blank_batch, template, eos_token)
    assert blank_result["text"][0] == "Instruction: \nInput: \nOutput: "

    # Scenario 3: two examples in one batch, rendered independently and in order.
    paired_batch = {
        "instruction": ["Test instruction 1", "Test instruction 2"],
        "input": ["Test input 1", "Test input 2"],
        "output": ["Test output 1", "Test output 2"],
    }
    expected_texts = [
        "Instruction: Test instruction 1\nInput: Test input 1\nOutput: Test output 1",
        "Instruction: Test instruction 2\nInput: Test input 2\nOutput: Test output 2",
    ]
    paired_result = formatting_prompts_func(paired_batch, template, eos_token)
    assert len(paired_result["text"]) == 2
    for position, expected_text in enumerate(expected_texts):
        assert paired_result["text"][position] == expected_text