Issue with step 400_000

#3
by bayazitdeniz - opened

Hello BLOOM team! I wanted to thank you for releasing these intermediate checkpoints: they are super useful for interpreting pretraining dynamics, and such resources are rare for multilingual models.

I wanted to ask what might be happening with checkpoint step 400_000. I first noticed on a subject-verb agreement task for English and French that performance drops to random at this checkpoint (worse than step 1_000). To verify this behavior, I ran a quick language modeling evaluation of each checkpoint on a subset of wikitext. I will copy-paste the code as a follow-up to this discussion, but here is the perplexity I get per step:

{1000: 299.0409986707899,
 10000: 53.00332175360786,
 100000: 34.277858310275604,
 200000: 32.3691151936849,
 300000: 30.913375430636936,
 400000: 338341.52430555556, # <------ very high
 500000: 28.414903428819443,
 600000: 26.762985229492188}

I am wondering whether you may have accidentally uploaded the initial model (step 0) as step 400_000, since the performance is even worse than at step 1_000.
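
A quick way to test this hypothesis (just a rough sketch on my end, reusing the same repo and global_step revision names as in the script below) would be to compare the input embedding weights of the suspect checkpoint against a neighbouring one: two nearby trained checkpoints should be far more similar to each other than either is to an untrained initialization.

import torch
from transformers import AutoModelForCausalLM

def embedding_cosine(rev_a, rev_b, repo="bigscience/bloom-1b1-intermediate"):
    # Load each revision on CPU and keep only its flattened input embedding weights.
    flat = []
    for rev in (rev_a, rev_b):
        model = AutoModelForCausalLM.from_pretrained(repo, revision=rev)
        flat.append(model.get_input_embeddings().weight.detach().flatten())
        del model
    return torch.nn.functional.cosine_similarity(flat[0], flat[1], dim=0).item()

# Close to 1.0 for neighbouring trained checkpoints, much lower if one is an untrained init.
print(embedding_cosine("global_step300000", "global_step400000"))
print(embedding_cosine("global_step300000", "global_step500000"))

I have not run this end to end, but it should make it obvious whether step 400_000 sits close to its neighbours or not.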

Thank you for the help!

All the best,
Deniz

from itertools import chain
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer, default_data_collator
import datasets

def group_texts(examples, block_size):
    # Concatenate all inputs
    concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # Drop the small remainder
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of block size
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    # Copy the same for labels
    result["labels"] = result["input_ids"].copy()
    return result

def tokenize_and_chunk(
        wikitext_dataset,
        column_names,
        tokenizer,
        block_size=None
    ):
    text_column_name = "text" if "text" in column_names else column_names[0]
    
    # 1) Decide on block size depending on model context length if None is given
    if block_size is None:
        block_size = tokenizer.model_max_length
        if block_size > 1024:
            block_size = 1024
    else:
        block_size = min(block_size, tokenizer.model_max_length)
    
    # 2) Tokenize texts
    def tokenize_function(examples):
        return tokenizer(examples[text_column_name])

    tokenized_datasets = wikitext_dataset.map(
        tokenize_function,
        batched=True,
        num_proc=1,
        remove_columns=column_names,
        desc="Running tokenizer on dataset"
    )

    # 3) Chunk token IDs into blocks
    controllm_datasets = tokenized_datasets.map(
        lambda examples: group_texts(
            examples=examples, 
            block_size=block_size
        ),
        batched=True,
        num_proc=1,
        desc=f"Grouping texts in chunks of {block_size}"
    )
    
    return controllm_datasets

def load_wikitext2_test_dataloader(
    lm_name: str,
    block_size=512,
    controllm_eval_batch_size: int = 8
):
    # 1) Load the dataset
    wikitext_dataset = datasets.load_dataset("wikitext", "wikitext-2-raw-v1", split="test[:10%]")
    # print(wikitext_dataset.info.version)
    
    # 2) Load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(lm_name, use_fast=True)
    tokenizer.pad_token = tokenizer.eos_token
    column_names = list(wikitext_dataset.features)
    
    # 3) Tokenize and chunk
    lm_datasets = tokenize_and_chunk(
        wikitext_dataset,
        column_names,
        tokenizer,
        block_size
    )
    lm_datasets.tokenizer = tokenizer
    
    print("Test dataset length: ", len(lm_datasets))

    # 4) Create test dataloader
    test_dataloader = DataLoader(
        lm_datasets, 
        shuffle=False,
        collate_fn=default_data_collator, 
        batch_size=controllm_eval_batch_size
    )

    return test_dataloader



@torch.no_grad()
def get_wiki_ppl(
    model, 
    wiki_dataloader: DataLoader, 
):
    sum_ppl = 0.0
    cnt = 0
    for batch in wiki_dataloader:
        batch["input_ids"] = batch["input_ids"].cuda().detach()
        batch["attention_mask"] = batch["attention_mask"].cuda().detach()
        tok_labels = batch["labels"].cuda().detach()

        output = model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=tok_labels)
        loss = output.loss.detach()
        
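        # Note: this averages exp(loss) over batches rather than computing a
        # token-weighted corpus perplexity, but it is enough to expose outlier checkpoints.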
        sum_ppl += torch.exp(loss).item()
        cnt += 1

    return sum_ppl / float(cnt)


dataloader = load_wikitext2_test_dataloader(lm_name="bigscience/bloom-1b1")
steps = [1_000, 10_000, 100_000, 200_000, 300_000, 400_000, 500_000, 600_000]
resdict = {}
for step in tqdm(steps, desc="steps"):
    rev = f"global_step{step}"
    model = AutoModelForCausalLM.from_pretrained(
        "bigscience/bloom-1b1-intermediate",
        revision=rev
    )
    model = model.cuda().eval()  # eval mode: disable dropout during measurement
    ppl = get_wiki_ppl(model, dataloader)
    resdict[step] = ppl

print(resdict)

BigScience Workshop org

Thanks for raising this! I doubt that 400K is actually step 0; more likely some corruption happened during checkpoint conversion or saving, which could have left that checkpoint with effectively random weights. Sorry for the issue!
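
If you want to double-check a downloaded revision on your side, a rough check along these lines (it just loads each revision the way your script already does and looks at the raw weights) should flag NaN/Inf values or obviously anomalous weight statistics:

import torch
from transformers import AutoModelForCausalLM

def checkpoint_health(revision, repo="bigscience/bloom-1b1-intermediate"):
    # Load the revision on CPU and inspect the raw weights for obvious problems.
    model = AutoModelForCausalLM.from_pretrained(repo, revision=revision)
    non_finite = [name for name, p in model.named_parameters() if not torch.isfinite(p).all()]
    emb = model.get_input_embeddings().weight
    print(f"{revision}: {len(non_finite)} tensors with NaN/Inf values, "
          f"embedding mean={emb.mean().item():.3e}, std={emb.std().item():.3e}")
    return non_finite

checkpoint_health("global_step300000")
checkpoint_health("global_step400000")

This is only a coarse check, but it should separate a corrupted or re-initialized checkpoint from a properly trained one.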

Thank you for the clarification! I will exclude it from my analyses then.

christopher changed discussion status to closed
