{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/Anastasia/ds_bootcamp/.elbrus2/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"from transformers import BertTokenizer, BertModel\n",
"import torch\n",
"from sklearn.metrics.pairwise import cosine_similarity\n",
"import numpy as np\n",
"import time\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"(array([ 5517, 9066, 13361, 11717, 320, 10793, 14201, 9305, 9199,\n",
" 8294]), array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0]))\n",
"3.533276081085205\n"
]
}
],
"source": [
"start_time = time.time()\n",
"\n",
"\n",
"# Читаем вектора сериалов\n",
"embeddings = np.loadtxt('data/embs.txt')\n",
"# Указываем пути к сохраненным модели и токенизатору\n",
"model_path = \"model\"\n",
"tokenizer_path = \"tokenizer\"\n",
"\n",
"# Загружаем модель\n",
"loaded_model = BertModel.from_pretrained(model_path)\n",
"\n",
"# Загружаем токенизатор\n",
"loaded_tokenizer = BertTokenizer.from_pretrained(tokenizer_path)\n",
"\n",
"\n",
"# Векторизуем запрос\n",
"loaded_model.eval()\n",
"tokens = loaded_tokenizer('петух закукарекал', return_tensors=\"pt\", padding=True, truncation=True)\n",
"\n",
"# Переместите токены на тот же устройство, что и модель\n",
"tokens = {key: value.to(loaded_model.device) for key, value in tokens.items()}\n",
"\n",
"# Передача токенов в модель для получения эмбеддингов\n",
"with torch.no_grad():\n",
" output = loaded_model(**tokens)\n",
"\n",
"# Эмбеддинги получаются из последнего скрытого состояния\n",
"user_embedding = output.last_hidden_state.mean(dim=1).squeeze().cpu().detach().numpy()\n",
"\n",
"\n",
"\n",
"cosine_similarities = cosine_similarity(embeddings, user_embedding.reshape(1, -1))\n",
"\n",
"# Получаем 10 наиболее подходящих строк-индексов в массиве нампай\n",
"top_10_indices = np.unravel_index(np.argsort(cosine_similarities, axis=None)[-10:], cosine_similarities.shape)\n",
"print(top_10_indices)\n",
"end_time = time.time()\n",
"print(end_time-start_time)"
]
},
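{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optional sketch, not part of the original flow: since `cosine_similarities` has shape `(n, 1)`, the same top-10 indices can be taken with a partial sort via `np.argpartition`, which avoids fully sorting the whole array. Variable names are reused from the cell above."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Flatten the (n, 1) similarity matrix to a 1-D array\n",
"flat_similarities = cosine_similarities.ravel()\n",
"\n",
"# Partial sort: pick the 10 largest similarities, then order them best-first\n",
"top_10 = np.argpartition(flat_similarities, -10)[-10:]\n",
"top_10 = top_10[np.argsort(flat_similarities[top_10])[::-1]]\n",
"print(top_10)"
]
},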
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[5517, 9066, 13361, 11717, 320, 10793, 14201, 9305, 9199, 8294]"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"list(top_10_indices[0])"
]
},
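{
"cell_type": "markdown",
"metadata": {},
"source": [
"A hypothetical follow-up: map the row indices back to series titles. The file `data/titles.txt` (one title per line, aligned with the rows of `data/embs.txt`) is an assumed layout, not confirmed by this notebook."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Assumed layout: data/titles.txt holds one title per embedding row\n",
"with open('data/titles.txt', encoding='utf-8') as f:\n",
"    titles = [line.strip() for line in f]\n",
"\n",
"# argsort returns ascending order, so reverse to print the best match first\n",
"for idx in top_10_indices[0][::-1]:\n",
"    print(idx, titles[idx])"
]
}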
],
"metadata": {
"kernelspec": {
"display_name": ".elbrus2",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
}