ronald committed
Commit a958a99 · 1 Parent(s): b2af956
Files changed (1):
  1. local_coh_ppl.py  +24 -7
local_coh_ppl.py CHANGED

@@ -165,7 +165,14 @@ class LocalCohPPL(evaluate.Measurement):
         tokenizer.add_special_tokens({"pad_token": existing_special_tokens[0]})

         model.config.max_length = 512 if "scibert" in model_id else model.config.max_length
-        max_tokenized_len = model.config.max_length - 1
+        if add_start_token:
+            # leave room for <BOS> token to be added:
+            assert (
+                tokenizer.bos_token is not None
+            ), "Input model must already have a BOS token if using add_start_token=True. Please use a different model, or set add_start_token=False"
+            max_tokenized_len = model.config.max_length - 1
+        else:
+            max_tokenized_len = model.config.max_length

         loss_fct = CrossEntropyLoss(reduction="none")

@@ -187,11 +194,21 @@ class LocalCohPPL(evaluate.Measurement):

             encoded_texts = encodings["input_ids"]
             attn_masks = encodings["attention_mask"]
-            bos_tokens_tensor = torch.tensor([[tokenizer.bos_token_id]] * encoded_texts.size(dim=0)).to(device)
-            encoded_texts = torch.cat([bos_tokens_tensor, encoded_texts], dim=1)
-            attn_masks = torch.cat(
-                [torch.ones(bos_tokens_tensor.size(), dtype=torch.int64).to(device), attn_masks], dim=1
-            )
+
+            # check that each input is long enough:
+            if add_start_token:
+                assert torch.all(torch.ge(attn_masks.sum(1), 1)), "Each input text must be at least one token long."
+            else:
+                assert torch.all(
+                    torch.ge(attn_masks.sum(1), 2)
+                ), "When add_start_token=False, each input text must be at least two tokens long. Run with add_start_token=True if inputting strings of only one token, and remove all empty input strings."
+
+            if add_start_token:
+                bos_tokens_tensor = torch.tensor([[tokenizer.bos_token_id]] * encoded_texts.size(dim=0)).to(device)
+                encoded_texts = torch.cat([bos_tokens_tensor, encoded_texts], dim=1)
+                attn_masks = torch.cat(
+                    [torch.ones(bos_tokens_tensor.size(), dtype=torch.int64).to(device), attn_masks], dim=1
+                )

             # tokenize by sentence
             for pred in batch_sents:

@@ -210,7 +227,7 @@ class LocalCohPPL(evaluate.Measurement):
             labels = encoded_texts

             with torch.no_grad():
-                out_logits = model(encoded_batch, attention_mask=attn_mask).logits
+                out_logits = model(encoded_texts, attention_mask=attn_masks).logits

             shift_logits = out_logits[..., :-1, :].contiguous()
             shift_labels = labels[..., 1:].contiguous()
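
For readers following the change, here is a minimal, self-contained sketch of what the new `add_start_token` path computes, outside the metric class. It is not the repository's code: the model id, example texts, and helper variables below are illustrative, and it scores plain per-text perplexity rather than the sentence-level local coherence that `local_coh_ppl.py` builds on top of it. It mirrors the diff's logic: reserve one position for BOS, prepend `tokenizer.bos_token_id`, run the model, then shift logits and labels and apply `CrossEntropyLoss(reduction="none")`.

```python
# Sketch only: standalone per-text perplexity with the add_start_token behavior
# from this commit. Model id and texts are illustrative; the real metric
# additionally tokenizes sentence by sentence for local coherence.
import torch
from torch.nn import CrossEntropyLoss
from transformers import AutoModelForCausalLM, AutoTokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"
model_id = "gpt2"  # assumption: any causal LM with a BOS token behaves the same way
texts = ["The model reads this text.", "And this one."]

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id).to(device)
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 ships without a pad token

add_start_token = True
# leave room for the <BOS> token, exactly as the diff does
max_tokenized_len = model.config.max_length - 1 if add_start_token else model.config.max_length

encodings = tokenizer(
    texts,
    add_special_tokens=False,
    padding=True,
    truncation=True,
    max_length=max_tokenized_len,
    return_tensors="pt",
).to(device)
encoded_texts = encodings["input_ids"]
attn_masks = encodings["attention_mask"]

if add_start_token:
    # prepend BOS so the first real token of each text also gets a prediction
    bos_tokens_tensor = torch.tensor([[tokenizer.bos_token_id]] * encoded_texts.size(dim=0)).to(device)
    encoded_texts = torch.cat([bos_tokens_tensor, encoded_texts], dim=1)
    attn_masks = torch.cat(
        [torch.ones(bos_tokens_tensor.size(), dtype=torch.int64).to(device), attn_masks], dim=1
    )

labels = encoded_texts
with torch.no_grad():
    out_logits = model(encoded_texts, attention_mask=attn_masks).logits

# shift so position i predicts token i+1, mask out padding, average the NLL
shift_logits = out_logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
shift_masks = attn_masks[..., 1:].contiguous()
loss_fct = CrossEntropyLoss(reduction="none")
nll = loss_fct(shift_logits.transpose(1, 2), shift_labels) * shift_masks
perplexities = torch.exp(nll.sum(1) / shift_masks.sum(1))
print(perplexities)  # one perplexity value per input text
```

With `add_start_token=True` the shifted labels cover every real token of each text (the BOS position supplies the first prediction), which is why the diff only requires inputs to be at least one token long in that branch and at least two tokens long otherwise.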