{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [], "gpuType": "T4" }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" }, "accelerator": "GPU" }, "cells": [ { "cell_type": "markdown", "source": [ "# nanoBERT Example\n", "\n", "Here we present nanoBERT, a nanobody-specific transformer. Its primary application is position infilling: predicting which amino acids are plausible at a given position according to the nanobody-specific distribution." ], "metadata": { "id": "JU2dnhr24egK" } }, { "cell_type": "code", "execution_count": 9, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "gxL4QKeNqYXI", "outputId": "256d9b91-ed93-462a-8d6f-8c257b973f91" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (4.34.1)\n", "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.12.4)\n", "Requirement already satisfied: huggingface-hub<1.0,>=0.16.4 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.17.3)\n", "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (1.23.5)\n", "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers) (23.2)\n", "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers) (6.0.1)\n", "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers) (2023.6.3)\n", "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers) (2.31.0)\n", "Requirement already satisfied: tokenizers<0.15,>=0.14 in /usr/local/lib/python3.10/dist-packages (from transformers) (0.14.1)\n", "Requirement already satisfied: safetensors>=0.3.1 in 
/usr/local/lib/python3.10/dist-packages (from transformers) (0.4.0)\n", "Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.10/dist-packages (from transformers) (4.66.1)\n", "Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.16.4->transformers) (2023.6.0)\n", "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub<1.0,>=0.16.4->transformers) (4.5.0)\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.3.0)\n", "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (3.4)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2.0.7)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers) (2023.7.22)\n" ] } ], "source": [ "# Install the required library\n", "! 
pip install --upgrade transformers" ] }, { "cell_type": "code", "source": [ "from transformers import pipeline, RobertaTokenizer, AutoModel" ], "metadata": { "id": "vG5ndbr_rYjL" }, "execution_count": 10, "outputs": [] }, { "cell_type": "code", "source": [ "# Initialise the tokenizer.\n", "# return_tensors is an option of the encoding call, not of loading, so it is not passed here.\n", "tokenizer = RobertaTokenizer.from_pretrained(\"NaturalAntibody/nanoBERT\")" ], "metadata": { "id": "1GNqH8HlrzmF" }, "execution_count": 11, "outputs": [] }, { "cell_type": "code", "source": [ "# Initialise the fill-mask pipeline. The model is loaded from the same Hub repository\n", "# as the tokenizer so the two stay in sync; top_k=20 asks for the 20 best-scoring tokens.\n", "unmasker = pipeline('fill-mask', model=\"NaturalAntibody/nanoBERT\", tokenizer=tokenizer, top_k=20)" ], "metadata": { "id": "3CYcwIOU3xCY" }, "execution_count": 12, "outputs": [] }, { "cell_type": "code", "source": [ "# Predict the residue probability at a masked position.\n", "# The fill-mask pipeline rejects input that does not contain the tokenizer's mask token,\n", "# so the residue at MASK_POSITION is replaced by tokenizer.mask_token before the call.\n", "# NOTE: the saved output of this cell predates this change; re-run the cell to refresh it.\n", "sequence = \"QLVSGPEVKKPGASVKVSCKASGYIFNNYGISWVRQAPGQGLEWMGWISTDNGNTNYAQKVQGRVTMTTDTSTSTAYMELRSLRYDDTAVYYCATNWGSYFEHWGQGTLVTVSS\"\n", "MASK_POSITION = 98  # 0-based index of the residue to predict; change it to probe another position\n", "seq = sequence[:MASK_POSITION] + tokenizer.mask_token + sequence[MASK_POSITION + 1:]\n", "\n", "# With exactly one mask the pipeline returns a flat list of dicts (one per candidate token).\n", "# With several masks it returns one such list per mask, and the loop below would need nesting.\n", "residueProbability = unmasker(seq)\n", "\n", "# Print residue probabilities\n", "for scores in residueProbability:\n", "    print(f\"Amino Acid : {scores['token_str']}, probability = {scores['score']}\")" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "6rtUxgbYsygY", "outputId": "da127f6a-e076-44ba-fce8-ff68c06cf354" }, "execution_count": 13, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Amino Acid : S, probability = 0.4827525019645691\n", "Amino Acid : A, probability = 0.22524100542068481\n", "Amino Acid : N, probability = 0.09490441530942917\n", "Amino Acid : Y, probability = 0.07571367919445038\n", "Amino Acid : K, probability = 0.04161035269498825\n", "Amino Acid : T, probability = 0.027568845078349113\n", "Amino Acid : H, probability = 0.009884347207844257\n", "Amino Acid : C, probability = 0.008951968513429165\n", "Amino Acid : V, probability = 0.007528781425207853\n", "Amino Acid : R, probability = 0.006156255956739187\n", "Amino Acid : G, 
probability = 0.005135924089699984\n", "Amino Acid : I, probability = 0.004699127282947302\n", "Amino Acid : W, probability = 0.0030531329102814198\n", "Amino Acid : M, probability = 0.0022762243170291185\n", "Amino Acid : F, probability = 0.001321254065260291\n", "Amino Acid : E, probability = 0.0009838981786742806\n", "Amino Acid : L, probability = 0.0006674979231320322\n", "Amino Acid : D, probability = 0.000666878477204591\n", "Amino Acid : Q, probability = 0.0005539602716453373\n", "Amino Acid : P, probability = 0.00032376404851675034\n" ] } ] } ] }