{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Japanese BERT token embeddings\n",
    "\n",
    "Loads the pre-trained `cl-tohoku/bert-base-japanese-whole-word-masking` checkpoint, tokenizes a short Japanese text, and prints the shape of the model's last hidden state (one contextual vector per token).\n",
    "\n",
    "**Requirement:** `BertJapaneseTokenizer` performs MeCab-based word segmentation, so the MeCab bindings (`fugashi` plus a dictionary such as `ipadic`) must be installed in the kernel environment."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pprint import pprint\n",
    "\n",
    "import torch\n",
    "from transformers import BertJapaneseTokenizer, BertModel"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load the tokenizer and model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Single source of truth for the checkpoint, so the tokenizer and the model\n",
    "# are always loaded from the same place.\n",
    "MODEL_NAME = 'cl-tohoku/bert-base-japanese-whole-word-masking'\n",
    "\n",
    "# Load the pre-trained Japanese tokenizer and model.\n",
    "# The checkpoint declares BertJapaneseTokenizer as its tokenizer class; loading it\n",
    "# through the plain BertTokenizer triggers a transformers warning that tokenization\n",
    "# may be unexpected, so the matching class is used here.\n",
    "tokenizer = BertJapaneseTokenizer.from_pretrained(MODEL_NAME)\n",
    "model = BertModel.from_pretrained(MODEL_NAME)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Tokenize the text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tokenize the text and convert it to PyTorch tensors\n",
    "# (input_ids, token_type_ids, attention_mask).\n",
    "text = \"お正月休み\"\n",
    "encoded_input = tokenizer(text, return_tensors='pt')\n",
    "pprint(encoded_input)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Compute the embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get the token embeddings. Gradients are not needed for inference,\n",
    "# so the forward pass runs under no_grad.\n",
    "with torch.no_grad():\n",
    "    output = model(**encoded_input)\n",
    "\n",
    "# last_hidden_state has shape (batch_size, sequence_length, hidden_size).\n",
    "embeddings = output.last_hidden_state\n",
    "pprint(embeddings.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "chiikawa-yonezu",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}