{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "274e6135-2d97-4244-9183-65bcb1d24c80", "metadata": {}, "outputs": [], "source": [ "# Use the trained astroBERT model to generate embedings of text\n", "# to be used for downstream tasks" ] }, { "cell_type": "markdown", "id": "2cc88ed3-6f52-49a2-99c0-344387758ab5", "metadata": {}, "source": [ "# Tutorial 0: Loading astroBERT to produce text embeddings\n", "This tutorial will show you how to load astroBERT and produce text embeddings that can be used on downstream tasks." ] }, { "cell_type": "code", "execution_count": 2, "id": "9e65c041-9d66-4fb1-96b9-4937000da02e", "metadata": {}, "outputs": [], "source": [ "# 1 - load models and tokenizer" ] }, { "cell_type": "code", "execution_count": 3, "id": "67d99e96-c532-49ef-8542-a48eef818956", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2022-10-20 16:07:24.705905: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0\n" ] } ], "source": [ "from transformers import AutoTokenizer, AutoModel" ] }, { "cell_type": "code", "execution_count": 4, "id": "00e1d48e-9898-44ef-b00e-43e3ab7fed7d", "metadata": {}, "outputs": [], "source": [ "# the model path can either be the name of the Huggingface repository\n", "remote_model_path = 'adsabs/astroBERT'\n", "# or the local path to the directory containing model weight and tokenizer vocab\n", "local_model_path = '../'" ] }, { "cell_type": "code", "execution_count": 5, "id": "9bcc6009-6009-463f-a7da-f010c5fae27e", "metadata": {}, "outputs": [], "source": [ "# make sure you load the tokenier with do_lower_case=False\n", "astroBERT_tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=remote_model_path,\n", " use_auth_token=True,\n", " add_special_tokens=True,\n", " do_lower_case=False,\n", " )" ] }, { "cell_type": "code", "execution_count": 6, "id": "dbd144f0-6038-4917-94b0-aea9da72cac5", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "PreTrainedTokenizerFast(name_or_path='adsabs/astroBERT', vocab_size=30000, model_max_len=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "astroBERT_tokenizer" ] }, { "cell_type": "code", "execution_count": 7, "id": "dd9a9257-cbe4-4908-a9f4-8e1431dc375a", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Some weights of the model checkpoint at adsabs/astroBERT were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight']\n", "- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. 
{ "cell_type": "code", "execution_count": 7, "id": "dd9a9257-cbe4-4908-a9f4-8e1431dc375a", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Some weights of the model checkpoint at adsabs/astroBERT were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight']\n", "- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", "- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n" ] } ], "source": [ "# AutoModel defaults to BertModel for this checkpoint\n", "# it's normal to get warnings: a BertModel does not load the weights used only for pretraining\n", "astroBERT_automodel = AutoModel.from_pretrained(remote_model_path,\n", "                                                use_auth_token=True,\n", "                                                )" ] },
{ "cell_type": "code", "execution_count": 8, "id": "572ddd38-a0dc-4583-a5a6-c4f3b2cb2553", "metadata": {}, "outputs": [], "source": [ "# 2 - run inference; the outputs are the embeddings" ] },
{ "cell_type": "code", "execution_count": 9, "id": "32fc0b97-4a2d-42ab-aa83-f5d8b39672b1", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "torch.Size([3, 54])\n" ] } ], "source": [ "# list of strings for which we want embeddings\n", "strings = ['The Chandra X-ray Observatory (CXO), previously known as the Advanced X-ray Astrophysics Facility (AXAF), is a Flagship-class space telescope launched aboard the Space Shuttle Columbia during STS-93 by NASA on July 23, 1999.',\n", "           'Independent lines of evidence from Type Ia supernovae and the CMB imply that the universe today is dominated by a mysterious form of energy known as dark energy, which appears to homogeneously permeate all of space.',\n", "           'This work has been developed in the framework of the ‘Darklight’ programme, supported by the European Research Council through an Advanced Research Grant to LG (Project # 291521).'\n", "           ]\n", "\n", "# tokenize the strings, with padding (needed to process multiple strings efficiently)\n", "inputs = astroBERT_tokenizer(strings,\n", "                             padding=True,\n", "                             return_tensors='pt'\n", "                             )\n", "\n", "# check the shape of the inputs\n", "print(inputs['input_ids'].shape)" ] },
{ "cell_type": "code", "execution_count": 10, "id": "8b7c9456-573a-48e7-9bc2-839fcc25631d", "metadata": {}, "outputs": [], "source": [ "# pass the inputs through astroBERT\n", "import torch\n", "# no need for gradients, since we are only doing inference\n", "with torch.no_grad():\n", "    output = astroBERT_automodel(**inputs,\n", "                                 output_hidden_states=False\n", "                                 )" ] },
{ "cell_type": "code", "execution_count": 11, "id": "116de57a-bb31-48d7-9556-64e01a16d56f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "torch.Size([3, 54, 768])\n" ] } ], "source": [ "# BertModel outputs two tensors: last_hidden_state (our embeddings) and pooler_output (usually discarded, as it is not a meaningful sentence representation)\n", "# see https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertModel.forward\n", "# embeddings have shape (# of strings, length of the tokenized strings (padded), 768 (the BERT hidden size))\n", "embeddings = output[0]\n", "print(embeddings.shape)" ] },
{ "cell_type": "code", "execution_count": 12, "id": "38e45291-6fd7-48cf-83df-e1cc5c8a699f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "tensor([[ 0.5546, 0.9121, 0.6550, ..., -0.1925, 0.7077, -0.2405],\n", " [ 0.6252, 0.3175, 1.0899, ..., 0.0576, 0.0529, 0.0603],\n", " [ 0.1803, -0.4567, 1.2688, ..., 0.6026, -0.5718, -0.2060],\n", " ...,\n", " [-0.4397, -0.5334, 1.1682, ..., 0.9541, 0.4046, -0.4756],\n", " [-0.3911, 0.7793, 0.2432, ..., 0.2268, -1.0489, -1.4864],\n", " [-0.4529, -0.7346, 0.0675, ..., -0.3246, -0.2333, -0.6154]])\n" ] } ], "source": [ "print(embeddings[0])" ] },
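{ "cell_type": "markdown", "id": "5b3c4d6e-7f80-4192-a0b3-d4e5f6071823", "metadata": {}, "source": [ "To get one fixed-size vector per string rather than one vector per token, a common approach (one option among several, not prescribed by astroBERT) is to mean-pool the token embeddings, using the attention mask to exclude the padding positions. The sketch below reuses `inputs` and `embeddings` from the cells above." ] },
{ "cell_type": "code", "execution_count": null, "id": "6c4d5e7f-8091-42a3-b1c4-e5f607182934", "metadata": {}, "outputs": [], "source": [ "# illustrative sketch: mean-pool token embeddings into one vector per string\n", "# the attention mask zeroes out the padding positions before averaging\n", "mask = inputs['attention_mask'].unsqueeze(-1)  # shape (3, 54, 1)\n", "sentence_embeddings = (embeddings * mask).sum(dim=1) / mask.sum(dim=1)\n", "print(sentence_embeddings.shape)  # expected: torch.Size([3, 768])" ] },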
{ "cell_type": "code", "execution_count": 13, "id": "26acf89f-b7fc-4872-ac81-0ee65030b465", "metadata": {}, "outputs": [], "source": [ "# If you wish to use the hidden states as additional embeddings, pass output_hidden_states=True\n", "\n", "# no need for gradients, since we are only doing inference\n", "with torch.no_grad():\n", "    output = astroBERT_automodel(**inputs,\n", "                                 output_hidden_states=True\n", "                                 )" ] },
{ "cell_type": "code", "execution_count": 14, "id": "a54314e9-5dcb-4c10-b0d2-219a93c7d16e", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "13\n", "torch.Size([3, 54, 768])\n" ] } ], "source": [ "# this produces 13 hidden-state tensors: the embedding-layer output plus one per transformer layer\n", "embeddings = output[2]\n", "print(len(embeddings))\n", "print(embeddings[0].shape)" ] },
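{ "cell_type": "markdown", "id": "7d5e6f80-91a2-4b3c-8d4e-f60718293a45", "metadata": {}, "source": [ "The hidden layers can be used individually or combined. As an illustrative sketch (a common heuristic from the BERT literature, not something prescribed by astroBERT), the cell below averages the last four hidden layers into a single token-level representation, reusing `embeddings` (the tuple of 13 hidden-state tensors) from the cell above." ] },
{ "cell_type": "code", "execution_count": null, "id": "8e6f7081-a2b3-4c4d-9e5f-071829304b56", "metadata": {}, "outputs": [], "source": [ "# illustrative sketch: average the last four hidden layers into one representation\n", "# embeddings is the tuple of 13 hidden-state tensors produced above\n", "last_four = torch.stack(embeddings[-4:])  # shape (4, 3, 54, 768)\n", "combined = last_four.mean(dim=0)  # shape (3, 54, 768)\n", "print(combined.shape)" ] },
{ "cell_type": "code", "execution_count": null, "id": "76765dcb-8035-44b2-a5a3-db181b561095", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.5" } }, "nbformat": 4, "nbformat_minor": 5 }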