import re
from unittest import mock
from unittest.mock import MagicMock, patch

import numpy as np
import pandas as pd
import pytest

from llm_studio.app_utils.default_datasets import (
    prepare_default_dataset_causal_language_modeling,
)
from llm_studio.python_configs.text_causal_language_modeling_config import (
    ConfigNLPCausalLMDataset,
    ConfigNLPCausalLMTokenizer,
    ConfigProblemBase,
)
from llm_studio.src.datasets.text_causal_language_modeling_ds import CustomDataset
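

# Check that the bundled default dataset loads into a DataFrame with the
# expected conversation columns and row count.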
def test_prepare_default_dataset(tmp_path):
    df = prepare_default_dataset_causal_language_modeling(tmp_path)
    assert isinstance(df, pd.DataFrame)
    assert set(df.keys()) == set(
        ["instruction", "output", "id", "parent_id", "lang", "rank"]
    )
    assert df.shape == (13026, 6)
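

# clean_output() should cut each prediction at the first configured stop word
# and strip surrounding whitespace.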
def test_clean_output():
    output = {
        "predicted_text": np.array(
            [
                "This is a test",
                "This is a test <stop> This is a test",
                "This is a test <stop2> This is a test",
                "This is a test <stop3> <stop> This is a test",
                "<stop2> <stop> This is a test",
                "This is a test <stop>",
            ]
        )
    }

    cfg = mock.MagicMock()
    cfg.tokenizer._stop_words = ["<stop>", "<stop2>", "<stop3>"]

    predicted_text_clean = CustomDataset.clean_output(output=output, cfg=cfg)[
        "predicted_text"
    ]
    assert predicted_text_clean == [
        "This is a test",
        "This is a test",
        "This is a test",
        "This is a test",
        "",
        "This is a test",
    ]
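

# sanity_check() should accept valid parent_id layouts (partially or fully empty)
# and raise when rows reference themselves or no conversation root exists.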
def test_sanity_check_raises_error():
    mock_config = MagicMock()
    mock_config.dataset.parent_id_column = "parent_id"
    mock_config.dataset.id_column = "id"
    mock_config.dataset.answer_column = "answer"

    df_1 = pd.DataFrame(
        {
            "id": [1, 2, 3, 4],
            "parent_id": [2, None, 4, 1],
            "answer": ["a", "b", "c", "d"],
            "other_data": ["a", "b", "c", "d"],
        }
    )
    CustomDataset.sanity_check(df_1, mock_config)

    df_2 = pd.DataFrame(
        {
            "id": [1, 2, 3, 4],
            "parent_id": [None, None, None, None],
            "answer": ["a", "b", "c", "d"],
            "other_data": ["a", "b", "c", "d"],
        }
    )
    CustomDataset.sanity_check(df_2, mock_config)

    invalid_df_1 = pd.DataFrame(
        {
            "id": [1, 2, 3, 4],
            "parent_id": [1, 2, 3, 4],
            "answer": ["a", "b", "c", "d"],
            "other_data": ["a", "b", "c", "d"],
        }
    )
    with pytest.raises(
        AssertionError,
        match=r"Parent id column:.* is the same as id column for some rows",
    ):
        CustomDataset.sanity_check(invalid_df_1, mock_config)

    invalid_df_2 = pd.DataFrame(
        {
            "id": [1, 2, 3, 4],
            "parent_id": [2, 3, 4, 1],
            "other_data": ["a", "b", "c", "d"],
        }
    )
    with pytest.raises(
        AssertionError,
        match=re.escape(
            "Did not find any conversation chain. "
            "Please ensure that some parent ids are empty."
            "\n"
            "Conversations are chained using parent id, "
            "start conversation record should have empty parent id."
            "\n"
            f"Parent id column checked:{mock_config.dataset.parent_id_column}"
        ),
    ):
        CustomDataset.sanity_check(invalid_df_2, mock_config)
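

# Fixture that patches AutoTokenizer.from_pretrained so CustomDataset can be
# constructed without downloading a real tokenizer.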
@pytest.fixture
def mock_auto_tokenizer():
    # from
    # https://github.com/deepset-ai/haystack/blob/b5aef24a7ebac55cb4ba492baf81a85598700b94/test/conftest.py#L908
    with patch(
        "transformers.AutoTokenizer.from_pretrained", autospec=True
    ) as mock_from_pretrained:
        yield mock_from_pretrained
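

# With the tokenizer mocked, the constructor should keep the DataFrame as-is and
# default to "train" mode.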
def test_init(mock_auto_tokenizer):
    df = pd.DataFrame(
        {
            "col_A": [1, 2, 3],
            "col_B": [4, 5, 6],
        }
    )
    cfg = mock.MagicMock()
    cfg.dataset.prompt_column = "col_A"
    cfg.dataset.answer_column = "col_B"
    cfg.dataset.parent_id_column = "None"
    cfg.dataset.system_column = "None"
    cfg.dataset.text_system_start = ""
    cfg.dataset.text_prompt_start = ""
    cfg.dataset.text_answer_separator = ""
    cfg.tokenizer.tokenizer_kwargs = '{"use_fast": true, "add_prefix_space": false}'

    dataset = CustomDataset(df, cfg)

    assert dataset.df.equals(df)
    assert dataset.mode == "train"
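

# With parent_id chaining and limit_chained_samples=True, the three rows collapse
# into a single conversation sample containing all turns, padded to max_length.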
def test_getitem():
    df = pd.DataFrame(
        {
            "prompt": ["prompt 1", "prompt 2", "prompt 3"],
            "answer": ["answer 1", "answer 2", "answer 3"],
            "parent_id": [None, 0, 1],
            "system": ["system 1", "system 2", "system 3"],
            "id": [0, 1, 2],
        }
    )
    cfg = ConfigProblemBase(
        dataset=ConfigNLPCausalLMDataset(
            prompt_column=("prompt",),
            answer_column="answer",
            parent_id_column="parent_id",
            system_column="system",
            text_system_start="System:",
            text_prompt_start="Prompt:",
            text_answer_separator="Answer:",
            add_eos_token_to_answer=True,
            limit_chained_samples=True,
        ),
        tokenizer=ConfigNLPCausalLMTokenizer(max_length=513),
    )
    cfg.llm_backbone = "EleutherAI/pythia-2.8b-deduped"

    dataset = CustomDataset(df, cfg)
    assert len(dataset) == 1

    result = dataset[0]
    assert isinstance(result, dict)
    assert set(result.keys()) == {
        "labels",
        "input_ids",
        "attention_mask",
        "prompt_input_ids",
        "prompt_attention_mask",
        "answer_input_ids",
        "answer_attention_mask",
    }

    assert (
        dataset.tokenizer.decode(result["input_ids"], skip_special_tokens=True)
        == "System:system 1"
        "Prompt:prompt 1"
        "Answer:answer 1"
        "Prompt:prompt 2"
        "Answer:answer 2"
        "Prompt:prompt 3"
        "Answer:answer 3"
    )

    assert (
        dataset.tokenizer.decode(result["prompt_input_ids"], skip_special_tokens=True)
        == "System:system 1"
        "Prompt:prompt 1"
        "Answer:answer 1"
        "Prompt:prompt 2"
        "Answer:answer 2"
        "Prompt:prompt 3"
        "Answer:"
    )

    assert (
        dataset.tokenizer.decode(result["input_ids"], skip_special_tokens=False)
        == "<|endoftext|>" * 475 + "System:system 1"
        "<|endoftext|>"
        "Prompt:prompt 1"
        "<|endoftext|>"
        "Answer:answer 1"
        "<|endoftext|>"
        "Prompt:prompt 2"
        "<|endoftext|>"
        "Answer:answer 2"
        "<|endoftext|>"
        "Prompt:prompt 3"
        "<|endoftext|>"
        "Answer:answer 3"
        "<|endoftext|>"
    )
    assert result["input_ids"].shape == (513,)
    assert result["prompt_input_ids"].shape == (513,)
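

# Without a parent_id column, every row is an independent single-turn sample.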
def test_getitem_no_chaining():
    df = pd.DataFrame(
        {
            "prompt": ["prompt 1", "prompt 2", "prompt 3"],
            "answer": ["answer 1", "answer 2", "answer 3"],
            "parent_id": [None, 0, 1],
            "system": ["system 1", "system 2", "system 3"],
            "id": [0, 1, 2],
        }
    )
    cfg = ConfigProblemBase(
        dataset=ConfigNLPCausalLMDataset(
            prompt_column=("prompt",),
            answer_column="answer",
            parent_id_column="None",
            system_column="system",
            text_system_start="System:",
            text_prompt_start="Prompt:",
            text_answer_separator="Answer:",
            add_eos_token_to_answer=True,
        ),
        tokenizer=ConfigNLPCausalLMTokenizer(max_length=513),
    )
    cfg.llm_backbone = "EleutherAI/pythia-2.8b-deduped"

    dataset = CustomDataset(df, cfg)
    assert len(dataset) == 3

    for i in range(3):
        result = dataset[i]
        assert isinstance(result, dict)
        assert set(result.keys()) == {
            "labels",
            "input_ids",
            "attention_mask",
            "prompt_input_ids",
            "prompt_attention_mask",
            "answer_input_ids",
            "answer_attention_mask",
        }

        assert (
            dataset.tokenizer.decode(result["input_ids"], skip_special_tokens=True)
            == f"System:system {i+1}"
            f"Prompt:prompt {i+1}"
            f"Answer:answer {i+1}"
        )
        assert (
            dataset.tokenizer.decode(
                result["prompt_input_ids"], skip_special_tokens=True
            )
            == f"System:system {i+1}"
            f"Prompt:prompt {i+1}"
            "Answer:"
        )
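

# Only answer tokens (plus the appended EOS) should contribute to the loss,
# i.e. remain unmasked (!= -100) in the labels.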
def test_encode():
    df = pd.DataFrame(
        {
            "prompt": ["a", "a"],
            "answer": ["b", "b"],
            "parent_id": [None, 0],
            "id": [0, 1],
        }
    )
    cfg = ConfigProblemBase(
        dataset=ConfigNLPCausalLMDataset(
            prompt_column=("prompt",),
            answer_column="answer",
            parent_id_column="parent_id",
            text_prompt_start="<|prompt|>",
            text_answer_separator="<|answer|>",
            add_eos_token_to_answer=True,
            limit_chained_samples=True,
        ),
        tokenizer=ConfigNLPCausalLMTokenizer(
            max_length=64,
            tokenizer_kwargs='{"use_fast": true, "add_prefix_space": false}',
        ),
    )
    cfg.llm_backbone = "h2oai/h2o-danube2-1.8b-base"

    dataset = CustomDataset(df, cfg)
    assert len(dataset) == 1

    result = dataset[0]
    labels = result["labels"]
    assert (labels != -100).sum() == 4

    out = dataset.tokenizer.decode(result["input_ids"]).replace("<unk>", "")
    assert out == "<|prompt|>a</s><|answer|>b</s><|prompt|>a</s><|answer|>b</s>"
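

# With max_length=2 almost the entire conversation is truncated away; only the
# final answer text remains in the encoded sample.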
def test_encode_maxlength():
    df = pd.DataFrame(
        {
            "prompt": ["a", "a"],
            "answer": ["b", "a b"],
            "parent_id": [None, 0],
            "id": [0, 1],
        }
    )
    cfg = ConfigProblemBase(
        dataset=ConfigNLPCausalLMDataset(
            prompt_column=("prompt",),
            answer_column="answer",
            parent_id_column="parent_id",
            text_prompt_start="<|prompt|>",
            text_answer_separator="<|answer|>",
            add_eos_token_to_answer=True,
            limit_chained_samples=True,
        ),
        tokenizer=ConfigNLPCausalLMTokenizer(
            max_length=2,
            tokenizer_kwargs='{"use_fast": true, "add_prefix_space": false}',
        ),
    )
    cfg.llm_backbone = "h2oai/h2o-danube2-1.8b-base"

    dataset = CustomDataset(df, cfg)
    assert len(dataset) == 1

    result = dataset[0]
    out = dataset.tokenizer.decode(result["input_ids"]).replace("<unk>", "")
    assert out == "a b"
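

# personalize=True should rewrite mentions of the original assistant and author
# in prompts and answers with the configured chatbot_name and chatbot_author.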
def test_preprocess_dataframe_personalize():
    df = pd.DataFrame(
        {
            "prompt": ["Open Assistant", "a"],
            "answer": ["b", "LAION b"],
            "parent_id": [None, 0],
            "id": [0, 1],
        }
    )
    cfg = ConfigProblemBase(
        dataset=ConfigNLPCausalLMDataset(
            prompt_column=("prompt",),
            answer_column="answer",
            parent_id_column="parent_id",
            chatbot_author="H2O.ai",
            chatbot_name="Danube",
            personalize=True,
        ),
    )
    cfg.llm_backbone = "h2oai/h2o-danube2-1.8b-base"

    assert df["prompt"].str.contains("Open Assistant").any()
    assert df["answer"].str.contains("LAION").any()

    dataset = CustomDataset(df, cfg)
    df = dataset.preprocess_dataframe(df, cfg)

    assert df["prompt"].str.contains("Danube").any()
    assert df["answer"].str.contains("H2O.ai").any()
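

# personalize=False should leave the DataFrame untouched.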
def test_preprocess_dataframe_no_personalize():
    df = pd.DataFrame(
        {
            "prompt": ["Open Assistant", "a"],
            "answer": ["b", "LAION b"],
            "parent_id": [None, 0],
            "id": [0, 1],
        }
    )
    cfg = ConfigProblemBase(
        dataset=ConfigNLPCausalLMDataset(
            prompt_column=("prompt",),
            answer_column="answer",
            parent_id_column="parent_id",
            chatbot_author="H2O.ai",
            chatbot_name="Danube",
            personalize=False,
        ),
    )
    cfg.llm_backbone = "h2oai/h2o-danube2-1.8b-base"

    assert df["prompt"].str.contains("Open Assistant").any()
    assert df["answer"].str.contains("LAION").any()

    dataset = CustomDataset(df, cfg)
    df_processed = dataset.preprocess_dataframe(df.copy(), cfg)

    assert df_processed["prompt"].str.contains("Open Assistant").any()
    assert df_processed["answer"].str.contains("LAION").any()
    assert not df_processed["prompt"].str.contains("Danube").any()
    assert not df_processed["answer"].str.contains("H2O.ai").any()
    assert df_processed.equals(df)