File size: 9,818 Bytes

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "274e6135-2d97-4244-9183-65bcb1d24c80",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Use the trained astroBERT model to generate embeddings of text\n",
    "# to be used for downstream tasks"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2cc88ed3-6f52-49a2-99c0-344387758ab5",
   "metadata": {},
   "source": [
    "# Tutorial 0: Loading astroBERT to produce text embeddings\n",
    "This tutorial will show you how to load astroBERT and produce text embeddings that can be used on downstream tasks."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "9e65c041-9d66-4fb1-96b9-4937000da02e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 1 - load models and tokenizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "67d99e96-c532-49ef-8542-a48eef818956",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2022-10-31 11:29:32.372654: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n"
     ]
    }
   ],
   "source": [
    "from transformers import AutoTokenizer, AutoModel"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "00e1d48e-9898-44ef-b00e-43e3ab7fed7d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# the model path can either be the name of the Huggingface repository\n",
    "remote_model_path = 'adsabs/astroBERT'\n",
    "# or the local path to the directory containing model weight and tokenizer vocab\n",
    "local_model_path = '../'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "9bcc6009-6009-463f-a7da-f010c5fae27e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# make sure you load the tokenizer with do_lower_case=False\n",
    "astroBERT_tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=remote_model_path,\n",
    "                                                    use_auth_token=True,\n",
    "                                                    add_special_tokens=True,\n",
    "                                                    do_lower_case=False,\n",
    "                                                   )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "dbd144f0-6038-4917-94b0-aea9da72cac5",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "PreTrainedTokenizerFast(name_or_path='adsabs/astroBERT', vocab_size=30000, model_max_len=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "astroBERT_tokenizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "dd9a9257-cbe4-4908-a9f4-8e1431dc375a",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Some weights of the model checkpoint at adsabs/astroBERT were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']\n",
      "- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
      "- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
     ]
    }
   ],
   "source": [
    "# automodels: defaults to BertModel\n",
    "# it's normal to get warnings as a BertModel will not load the weights used for PreTraining\n",
    "astroBERT_automodel = AutoModel.from_pretrained(remote_model_path, \n",
    "                                                use_auth_token=True,\n",
    "                                               )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "572ddd38-a0dc-4583-a5a6-c4f3b2cb2553",
   "metadata": {},
   "outputs": [],
   "source": [
    "# 2 - make some inference, the outputs are the embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "32fc0b97-4a2d-42ab-aa83-f5d8b39672b1",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "torch.Size([3, 54])\n"
     ]
    }
   ],
   "source": [
    "# list of strings for which we want embeddings\n",
    "strings = ['The Chandra X-ray Observatory (CXO), previously known as the Advanced X-ray Astrophysics Facility (AXAF), is a Flagship-class space telescope launched aboard the Space Shuttle Columbia during STS-93 by NASA on July 23, 1999.',\n",
    "           'Independent lines of evidence from Type Ia supernovae and the CMB imply that the universe today is dominated by a mysterious form of energy known as dark energy, which appears to homogeneously permeate all of space.',\n",
    "           'This work has been developed in the framework of the ‘Darklight’ programme, supported by the European Research Council through an Advanced Research Grant to LG (Project # 291521).'\n",
    "          ]\n",
    "\n",
    "# tokenize the strings, with padding (needed to process multiple strings efficiently)\n",
    "inputs = astroBERT_tokenizer(strings, \n",
    "                             padding=True, \n",
    "                             return_tensors='pt'\n",
    "                            )\n",
    "\n",
    "# check the shape of the inputs\n",
    "print(inputs['input_ids'].shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "8b7c9456-573a-48e7-9bc2-839fcc25631d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# pass the inputs through astroBERT\n",
    "import torch\n",
    "# no need for gradients, since we are only doing inference\n",
    "with torch.no_grad():\n",
    "    output = astroBERT_automodel(**inputs, \n",
    "                                 output_hidden_states=False\n",
    "                                )  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "116de57a-bb31-48d7-9556-64e01a16d56f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "torch.Size([3, 54, 768])\n"
     ]
    }
   ],
   "source": [
    "# BertModel outputs two tensors: last_hidden_state (our embeddings) and pooler_output (to be discarded as it's not meaningful)\n",
    "# see https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertModel.forward\n",
    "# embeddings will have shape = (# of strings, size of tokenized strings(padded), 768 (BERT embedding size))\n",
    "embeddings = output[0]\n",
    "print(embeddings.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "38e45291-6fd7-48cf-83df-e1cc5c8a699f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor([[ 0.5546,  0.9121,  0.6550,  ..., -0.1925,  0.7077, -0.2405],\n",
      "        [ 0.6252,  0.3175,  1.0899,  ...,  0.0576,  0.0529,  0.0603],\n",
      "        [ 0.1803, -0.4567,  1.2688,  ...,  0.6026, -0.5718, -0.2060],\n",
      "        ...,\n",
      "        [-0.4397, -0.5334,  1.1682,  ...,  0.9541,  0.4046, -0.4756],\n",
      "        [-0.3911,  0.7793,  0.2432,  ...,  0.2268, -1.0489, -1.4864],\n",
      "        [-0.4529, -0.7346,  0.0675,  ..., -0.3246, -0.2333, -0.6154]])\n"
     ]
    }
   ],
   "source": [
    "print(embeddings[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "26acf89f-b7fc-4872-ac81-0ee65030b465",
   "metadata": {},
   "outputs": [],
   "source": [
    "# If you wish to use the hidden states as additional embeddings, you can use output_hidden_states=True\n",
    "\n",
    "# no need for gradients, since we are only doing inference\n",
    "with torch.no_grad():\n",
    "    output = astroBERT_automodel(**inputs, \n",
    "                                 output_hidden_states=True\n",
    "                                )  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "a54314e9-5dcb-4c10-b0d2-219a93c7d16e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "13\n",
      "torch.Size([3, 54, 768])\n"
     ]
    }
   ],
   "source": [
    "# This will produce 13 embeddings: the output of the embedding layer plus one for each of the 12 hidden layers\n",
    "embeddings = output[2]\n",
    "print(len(embeddings))\n",
    "print(embeddings[0].shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "76765dcb-8035-44b2-a5a3-db181b561095",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}