formermagic
/

pyt5-base

@@ -4,75 +4,13 @@ Pre-trained model on CodeSearchNet Python dataset using a span-masking objective
 # How to use
-You can use this model to denoise span-masked sequences. Note that you'll need to add some boilerplate code to apply the noise to your sequences.
 First, install the [git-t5](https://github.com/formermagic/git-t5) pip package:
 ```shell
 > pip install git-t5
 ```
-Add the following code for encoding an input text:
-```python
-from typing import Dict, Optional, Tuple
-import numpy as np
-import torch
-from transformers import PreTrainedTokenizerBase
-from git_t5.data import DataCollatorForT5MLM
-def encode(
-    tokenizer: PreTrainedTokenizerBase,
-    text: str,
-    noise_density: float = 0.15,
-    mean_noise_span_length: float = 3.0,
-    extra_tokens_per_span_inputs: int = 1,
-    extra_tokens_per_span_targets: int = 1,
-    seed: Optional[int] = None,
-) -> Tuple[Dict[str, torch.Tensor], int]:
-    def compute_lengths(tokens_length: int) -> Tuple[int, int]:
-        num_noise_tokens = int(round(tokens_length * noise_density))
-        num_nonnoise_tokens = tokens_length - num_noise_tokens
-        num_noise_spans = int(round(num_noise_tokens / mean_noise_span_length))
-        # inputs contain all nonnoise tokens, sentinels for all noise spans
-        # and one EOS token.
-        return (
-            num_nonnoise_tokens + num_noise_spans * extra_tokens_per_span_inputs + 1,
-            num_noise_tokens + num_noise_spans * extra_tokens_per_span_targets + 1,
-        )
-    encoding = tokenizer(
-        text,
-        truncation=False,
-        return_attention_mask=False,
-        return_length=True,
-    )
-    input_length = encoding.pop("length")
-    input_length = input_length[0]
-    input_length, target_length = compute_lengths(input_length)
-    np.random.seed(seed)
-    data_collator = DataCollatorForT5MLM(
-        tokenizer=tokenizer,
-        noise_density=noise_density,
-        mean_noise_span_length=mean_noise_span_length,
-        input_length=input_length,
-        target_length=target_length,
-        eos_token_id=tokenizer.eos_token_id,
-        pad_token_id=tokenizer.pad_token_id,
-        decoder_start_token_id=tokenizer.pad_token_id,
-        sentinel_token_id=tokenizer.convert_tokens_to_ids("<extra_id_0>"),
-    )
-    batch = data_collator([encoding])  # type: ignore
-    batch = {key: torch.tensor(val) for key, val in batch.items()}
-    return batch, target_length
-```
 Next, download the model and tokenizer:
 ```python
 from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
@@ -84,6 +22,8 @@ tokenizer = AutoTokenizer.from_pretrained("formermagic/pyt5-base")
 Finally, encode your input and generate the output sequence:
 ```python
 text = """
 def alias(self, annotationtype, set, fallback=False):
     if inspect.isclass(annotationtype): annotationtype = annotationtype.ANNOTATIONTYPE
@@ -95,7 +35,7 @@ def alias(self, annotationtype, set, fallback=False):
         raise KeyError("No alias for set " + set)
 """
-batch, max_length = encode(tokenizer, text, seed=22)
 outputs = model.generate(batch["input_ids"], max_length=max_length, num_beams=1)
 print(tokenizer.batch_decode(outputs[..., 1:]))
 print(tokenizer.batch_decode(batch["labels"]))

 # How to use
+You can use this model to denoise span-masked sequences.
 First, install the [git-t5](https://github.com/formermagic/git-t5) pip package:
 ```shell
 > pip install git-t5
 ```
 Next, download the model and tokenizer:
 ```python
 from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
 Finally, encode your input and generate the output sequence:
 ```python
+from git_t5.utils import encode_input
 text = """
 def alias(self, annotationtype, set, fallback=False):
     if inspect.isclass(annotationtype): annotationtype = annotationtype.ANNOTATIONTYPE
         raise KeyError("No alias for set " + set)
 """
+batch, max_length = encode_input(tokenizer, text, seed=22)
 outputs = model.generate(batch["input_ids"], max_length=max_length, num_beams=1)
 print(tokenizer.batch_decode(outputs[..., 1:]))
 print(tokenizer.batch_decode(batch["labels"]))