# nanoBERT Example

Here we present nanoBERT, a nanobody-specific transformer. Its primary application is position infilling: predicting which amino acids could occur at a given position according to the nanobody-specific distribution.

In [1]:
# Install stadard library
! pip install --upgrade transformers

Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m46.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m34.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m78.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m63.9 MB/s[0m eta [36m0:00:00[0m
Col

In [2]:
from transformers import pipeline, RobertaTokenizer, AutoModel

In [3]:
# Load the nanoBERT tokenizer from the Hugging Face Hub.
# `return_tensors` is an option of tokenizer calls, not of `from_pretrained`,
# so it is not passed here.
tokenizer = RobertaTokenizer.from_pretrained("NaturalAntibody/nanoBERT")

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/285 [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt: 0.00B [00:00, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/633 [00:00<?, ?B/s]

In [4]:
# Build the fill-mask pipeline around the nanoBERT checkpoint, reusing the
# tokenizer created in the previous cell. top_k=20 requests the 20
# highest-scoring candidate tokens for a masked position.
unmasker = pipeline(
    "fill-mask",
    model="NaturalAntibody/nanoBERT",
    tokenizer=tokenizer,
    top_k=20,
)

Downloading pytorch_model.bin:   0%|          | 0.00/57.9M [00:00<?, ?B/s]

In [5]:
# Predict the residue probability at one or more masked positions.
# Mark each position to predict with '<mask>'.
seq = "QLVSGPEVKKPGASVKVSCKASGYIFNNYGISWVRQAPGQGLEWMGWISTDNGNTNYAQKVQGRVTMTTDTSTSTAYMELRSLRYDDTAVYYC<mask>ATNWGSYFEHWGQGTLVTVSS"

residueProbability = unmasker(seq)

# With a single '<mask>' the pipeline returns a flat list of candidate dicts;
# with several masks it returns one such list per mask. Normalise to a list of
# lists so that the same loop handles both cases.
if residueProbability and isinstance(residueProbability[0], dict):
    predictionsPerMask = [residueProbability]
else:
    predictionsPerMask = residueProbability

# Print residue probabilities; a per-mask header is only emitted when the
# sequence contains more than one masked position.
for maskIndex, candidates in enumerate(predictionsPerMask, start=1):
    if len(predictionsPerMask) > 1:
        print(f"Masked position {maskIndex}:")
    for scores in candidates:
        print(f"Amino Acid : {scores['token_str']}, probability = {scores['score']}")

Amino Acid : T, probability = 0.28031080961227417
Amino Acid : A, probability = 0.23146940767765045
Amino Acid : K, probability = 0.13046956062316895
Amino Acid : N, probability = 0.12727046012878418
Amino Acid : V, probability = 0.06813304871320724
Amino Acid : S, probability = 0.040362704545259476
Amino Acid : R, probability = 0.02661576308310032
Amino Acid : G, probability = 0.02190057747066021
Amino Acid : I, probability = 0.011736654676496983
Amino Acid : H, probability = 0.011689980514347553
Amino Acid : Y, probability = 0.010142817161977291
Amino Acid : E, probability = 0.007347866427153349
Amino Acid : M, probability = 0.0065687913447618484
Amino Acid : L, probability = 0.00589130399748683
Amino Acid : P, probability = 0.0055612483993172646
Amino Acid : Q, probability = 0.004960155114531517
Amino Acid : D, probability = 0.003958782181143761
Amino Acid : F, probability = 0.0026921925600618124
Amino Acid : C, probability = 0.0016993506578728557
Amino Acid : W, probability = 0.000