{ "cells": [ { "cell_type": "markdown", "id": "82a54277", "metadata": {}, "source": [ "## RuGPT experiments\n", "\n", "> sberbank-ai/rugpt3large_based_on_gpt2" ] }, { "cell_type": "markdown", "id": "01e5707d", "metadata": {}, "source": [ "1. Import and **unzip** texts.zip" ] }, { "cell_type": "code", "execution_count": null, "id": "f08737b8-b09d-40fc-99a3-1b5d5d83988a", "metadata": { "tags": [] }, "outputs": [], "source": [ "%pip install transformers\n", "%pip install accelerate==0.27.2" ] }, { "cell_type": "markdown", "id": "a5b6fcc4", "metadata": {}, "source": [ "## Train on merged texts" ] }, { "cell_type": "code", "execution_count": 1, "id": "bc03ca73-ff08-41b9-8ed7-22fcdda83e44", "metadata": { "execution": { "iopub.execute_input": "2024-04-09T17:55:15.495910Z", "iopub.status.busy": "2024-04-09T17:55:15.494965Z", "iopub.status.idle": "2024-04-09T17:55:44.728620Z", "shell.execute_reply": "2024-04-09T17:55:44.727242Z", "shell.execute_reply.started": "2024-04-09T17:55:15.495872Z" }, "tags": [] }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/jupyter/.local/lib/python3.10/site-packages/transformers/utils/hub.py:124: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.\n", " warnings.warn(\n", "/usr/local/lib/python3.10/dist-packages/torch/_utils.py:776: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n", " return self.fget.__get__(instance, owner)()\n" ] } ], "source": [ "from transformers import GPT2LMHeadModel, GPT2Tokenizer\n", "import torch\n", "DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n", "\n", "model_name_or_path = \"sberbank-ai/rugpt3large_based_on_gpt2\"\n", "tokenizer = GPT2Tokenizer.from_pretrained(model_name_or_path)\n", "model = GPT2LMHeadModel.from_pretrained(model_name_or_path).to(DEVICE)" ] }, { "cell_type": "code", "execution_count": null, "id": "4a601a11-b90a-48d6-9e88-52fb2147a354", "metadata": { "execution": { "iopub.execute_input": "2024-04-09T15:53:05.748515Z", "iopub.status.busy": "2024-04-09T15:53:05.747521Z", "iopub.status.idle": "2024-04-09T15:53:05.761003Z" }, "tags": [] }, "outputs": [ { "data": { "text/plain": [ "device(type='cuda')" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "DEVICE" ] }, { "cell_type": "markdown", "id": "c00c999a", "metadata": {}, "source": [ "#### data preparation" ] }, { "cell_type": "code", "execution_count": 2, "id": "d5d17f92-c988-475d-b1e3-b7c878c123c7", "metadata": { "execution": { "iopub.execute_input": "2024-04-09T17:55:44.732113Z", "iopub.status.busy": "2024-04-09T17:55:44.730633Z", "iopub.status.idle": "2024-04-09T17:55:59.272915Z", "shell.execute_reply": "2024-04-09T17:55:59.271784Z", "shell.execute_reply.started": "2024-04-09T17:55:44.732061Z" }, "tags": [] }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2024-04-09 17:55:53.531394: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n", "/home/jupyter/.local/lib/python3.10/site-packages/transformers/data/datasets/language_modeling.py:53: FutureWarning: This dataset will be removed from the library soon, preprocessing should be handled with the 🤗 Datasets library. 
You can have a look at this example script for pointers: https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm.py\n", "  warnings.warn(\n" ] } ], "source": [ "from transformers import TextDataset, DataCollatorForLanguageModeling\n", "import os\n", "\n", "# Path to the folder with the text files\n", "folder_path = '/home/jupyter/datasphere/project/texts/'\n", "\n", "# Build a single training file that concatenates every .txt file in the folder\n", "train_path = 'train_dataset.txt'\n", "with open(train_path, \"w\", encoding=\"utf-8\") as train_file:\n", "    for filename in os.listdir(folder_path):\n", "        if filename.endswith('.txt'):\n", "            file_path = os.path.join(folder_path, filename)\n", "            with open(file_path, \"r\", encoding=\"utf-8\") as file:\n", "                train_file.write(file.read() + '\\n')  # separate the texts with a newline\n", "\n", "with open(train_path, 'r', encoding='utf-8') as f:\n", "    lines = f.readlines()\n", "# Filter out empty lines\n", "non_empty_lines = [line for line in lines if line.strip()]\n", "\n", "train_path = 'train_dataset_stripped.txt'\n", "with open(train_path, 'w', encoding='utf-8') as f:\n", "    f.writelines(non_empty_lines)\n", "\n", "\n", "def save_first_n_lines(input_file, output_file, n):\n", "    with open(input_file, 'r', encoding='utf-8') as f:\n", "        lines = f.readlines()[:n]\n", "\n", "    with open(output_file, 'w', encoding='utf-8') as f:\n", "        f.writelines(lines)\n", "\n", "\n", "input_file = train_path\n", "output_file = 'train_dataset_stripped1.txt'\n", "n = 480  # number of lines to keep for training\n", "\n", "save_first_n_lines(input_file, output_file, n)\n", "train_path = output_file" ] }, { "cell_type": "markdown", "id": "25375406", "metadata": {}, "source": [ "#### Dataset, Dataloader, Trainer ---- Train!" ] },
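{ "cell_type": "markdown", "id": "a1f2e3d4", "metadata": {}, "source": [ "Aside: `TextDataset` (used below) is deprecated, and the `FutureWarning` above points to the 🤗 Datasets library instead. A minimal sketch of that replacement, assuming `datasets` is installed (`%pip install datasets`) and reusing the `tokenizer` and `train_path` defined above:" ] }, { "cell_type": "code", "execution_count": null, "id": "b2c3d4e5", "metadata": {}, "outputs": [], "source": [ "# Hedged sketch of the 🤗 Datasets replacement for the deprecated TextDataset.\n", "# Assumes `pip install datasets`; `tokenizer` and `train_path` come from above.\n", "from datasets import load_dataset\n", "\n", "block_size = 64\n", "raw = load_dataset(\"text\", data_files={\"train\": train_path})\n", "\n", "def tokenize_and_chunk(batch):\n", "    # Tokenize the batch as one stream and cut it into block_size-token chunks\n", "    ids = tokenizer(\"\\n\".join(batch[\"text\"]))[\"input_ids\"]\n", "    return {\"input_ids\": [ids[i:i + block_size]\n", "                          for i in range(0, len(ids) - block_size + 1, block_size)]}\n", "\n", "lm_dataset = raw[\"train\"].map(tokenize_and_chunk, batched=True, remove_columns=[\"text\"])" ] },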
] }, { "cell_type": "code", "execution_count": null, "id": "a5051a0d", "metadata": {}, "outputs": [], "source": [ " \n", " \n", "# Создание датасета\n", "train_dataset = TextDataset(tokenizer=tokenizer,file_path=train_path,block_size=64)\n", " \n", "# Создание даталодера (нарезает текст на оптимальные по длине куски)\n", "data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, \n", " mlm=False)" ] }, { "cell_type": "code", "execution_count": 3, "id": "7b41c5af-28ef-4bf0-adf7-0f5503bc4aea", "metadata": { "execution": { "iopub.execute_input": "2024-04-09T17:55:59.276218Z", "iopub.status.busy": "2024-04-09T17:55:59.274991Z", "iopub.status.idle": "2024-04-09T17:55:59.410517Z", "shell.execute_reply": "2024-04-09T17:55:59.409340Z", "shell.execute_reply.started": "2024-04-09T17:55:59.276164Z" }, "tags": [] }, "outputs": [], "source": [ "from transformers import Trainer, TrainingArguments\n", "\n", "training_args = TrainingArguments(\n", " output_dir=\"./finetuned\", # The output directory\n", " overwrite_output_dir=True, # Overwrite the content of the output dir\n", " num_train_epochs=200, # number of training epochs\n", " per_device_train_batch_size=32, # batch size for training\n", " per_device_eval_batch_size=32, # batch size for evaluation\n", " warmup_steps=10, # number of warmup steps for learning rate scheduler\n", " gradient_accumulation_steps=16, # to make \"virtual\" batch size larger\n", " fp16=True,\n", " )\n", "\n", "trainer = Trainer(\n", " model=model,\n", " args=training_args,\n", " data_collator=data_collator,\n", " train_dataset=train_dataset,\n", " optimizers = (torch.optim.AdamW(model.parameters(),lr=1e-5), None)\n", ")" ] }, { "cell_type": "code", "execution_count": 4, "id": "1ef56378-45fd-4f5c-8204-d44fea4baa96", "metadata": { "execution": { "iopub.execute_input": "2024-04-09T17:55:59.412736Z", "iopub.status.busy": "2024-04-09T17:55:59.411831Z", "iopub.status.idle": "2024-04-09T17:59:41.062238Z", "shell.execute_reply": "2024-04-09T17:59:41.061104Z", "shell.execute_reply.started": "2024-04-09T17:55:59.412699Z" }, "tags": [] }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 200/200 [03:41<00:00, 1.11s/it]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "{'train_runtime': 221.3393, 'train_samples_per_second': 65.058, 'train_steps_per_second': 0.904, 'train_loss': 0.2417168426513672, 'epoch': 178.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n" ] }, { "data": { "text/plain": [ "TrainOutput(global_step=200, training_loss=0.2417168426513672, metrics={'train_runtime': 221.3393, 'train_samples_per_second': 65.058, 'train_steps_per_second': 0.904, 'train_loss': 0.2417168426513672, 'epoch': 178.0})" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "trainer.train()" ] }, { "cell_type": "markdown", "id": "e20b26f0", "metadata": {}, "source": [ "#### eval() - для того, чтобы модель появилась в переменных нужного формата (?)" ] }, { "cell_type": "code", "execution_count": null, "id": "89bfaeee", "metadata": {}, "outputs": [], "source": [ "model.eval()" ] }, { "cell_type": "markdown", "id": "2c06a82e", "metadata": {}, "source": [ "### Saving model (pt)" ] }, { "cell_type": "code", "execution_count": 5, "id": "a965ca2c-f1fa-4731-a4e9-db7df64aa75c", "metadata": { "execution": { "iopub.execute_input": "2024-04-09T17:59:41.064580Z", "iopub.status.busy": "2024-04-09T17:59:41.063960Z", "iopub.status.idle": "2024-04-09T18:02:01.813359Z", "shell.execute_reply": 
"2024-04-09T18:02:01.811968Z", "shell.execute_reply.started": "2024-04-09T17:59:41.064530Z" }, "tags": [] }, "outputs": [], "source": [ "import torch\n", "file_path = \"model.pt\"\n", "\n", "# Сохраните модель\n", "torch.save(model.state_dict(), file_path)" ] }, { "cell_type": "markdown", "id": "b06ba374", "metadata": {}, "source": [ "### Loading model ----- НЕ работает(" ] }, { "cell_type": "code", "execution_count": null, "id": "33fe3fa1", "metadata": {}, "outputs": [], "source": [ "model_uploaded = GPT2LMHeadModel.from_pretrained('finetuned/model.safetensors').to(DEVICE)\n", "model_uploaded.eval()" ] }, { "cell_type": "markdown", "id": "43f55935", "metadata": {}, "source": [ "#### Tests" ] }, { "cell_type": "code", "execution_count": 6, "id": "643ceab4-60ee-42e4-ab2b-03c15815a2e2", "metadata": { "execution": { "iopub.execute_input": "2024-04-09T18:02:01.815815Z", "iopub.status.busy": "2024-04-09T18:02:01.814948Z", "iopub.status.idle": "2024-04-09T18:02:08.140460Z", "shell.execute_reply": "2024-04-09T18:02:08.139380Z", "shell.execute_reply.started": "2024-04-09T18:02:01.815764Z" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Зафайнтюнил я модель,\n", "Закачал я драйвера,\n", "Установил я винду\n", "Куча мала плюсов...\n", "Но для тебя... увы\n", "Нет в арсенале меня\n", "А если бы был бы вдруг я,\n", "То ты бы всё равно комп\n", "Так что нет меня с тобой\n", "Ну что же мне делать тебе\n", "Да вот же он я за стеной\n", "Ты б хоть спросила кто я\n", "Да ладно что уж тут спорить\n", "Я Зафайнтюнил модель\n", "И поставил её на но....\n", "У каждого есть свой особый взгляд \n", "И свои секреты в нём есть....\n", "\n" ] } ], "source": [ "#############\n", "text = \"Зафайнтюнил я модель\"\n", "input_ids = tokenizer.encode(text, return_tensors=\"pt\").to(DEVICE)\n", "\n", "\n", "model.eval()\n", "with torch.no_grad():\n", " out = model.generate(input_ids,\n", " do_sample=True,\n", " num_beams=2,\n", " temperature=3.5,\n", " top_p=0.9,\n", " max_length=128,\n", " )\n", "\n", "generated_text = list(map(tokenizer.decode, out))[0]\n", "\n", "print(generated_text)" ] }, { "cell_type": "code", "execution_count": 6, "id": "67888b5f-3957-4e5b-b557-1c16a9cb245c", "metadata": { "execution": { "iopub.execute_input": "2024-04-09T15:56:36.008311Z", "iopub.status.busy": "2024-04-09T15:56:36.007405Z", "iopub.status.idle": "2024-04-09T15:56:41.411536Z", "shell.execute_reply": "2024-04-09T15:56:41.410691Z", "shell.execute_reply.started": "2024-04-09T15:56:36.008260Z" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Зафайнтюнил я модель,\n", "Закачал я драйвера,\n", "Установил я винду\n", "Куча мала плюсов...\n", "Но для тебя... 
увы\n", "Нет в арсенале меня\n", "А если бы был бы вдруг я,\n", "То ты бы всё равно комп\n", "Так что нет меня с тобой\n", "Ну что же мне делать тебе\n", "Да вот же он я за стеной\n", "Ты б хоть спросила кто я\n", "Да ладно что уж тут спорить\n", "Я Зафайнтюнил модель\n", "И поставил её на но....\n", "У каждого есть свой особый взгляд \n", "И свои секреты в нём есть....\n", "\n" ] } ], "source": [ "text = \"Зафайнтюнил я модель\"\n", "input_ids = tokenizer.encode(text, return_tensors=\"pt\").to(DEVICE)\n", "\n", "\n", "model.eval()\n", "with torch.no_grad():\n", " out = model.generate(input_ids,\n", " do_sample=True,\n", " num_beams=2,\n", " temperature=3.5,\n", " top_p=0.9,\n", " max_length=128,\n", " )\n", "\n", "generated_text = list(map(tokenizer.decode, out))[0]\n", "\n", "print(generated_text)" ] }, { "cell_type": "code", "execution_count": 7, "id": "024fa8b1", "metadata": { "execution": { "iopub.execute_input": "2024-04-09T18:04:07.714964Z", "iopub.status.busy": "2024-04-09T18:04:07.713978Z", "iopub.status.idle": "2024-04-09T18:04:16.875218Z", "shell.execute_reply": "2024-04-09T18:04:16.874056Z", "shell.execute_reply.started": "2024-04-09T18:04:07.714918Z" }, "tags": [] }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/home/jupyter/datasphere/project/texts/A9.txt\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "/home/jupyter/.local/lib/python3.10/site-packages/transformers/data/datasets/language_modeling.py:53: FutureWarning: This dataset will be removed from the library soon, preprocessing should be handled with the 🤗 Datasets library. You can have a look at this example script for pointers: https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_mlm.py\n", " warnings.warn(\n" ] } ], "source": [ "file_paths = []\n", "\n", "# Путь к папке с текстовыми файлами\n", "folder_path = '/home/jupyter/datasphere/project/texts/'\n", "\n", "# Создадим единый файл с обучающим текстом, содержащий все файлы из папки\n", "train_path = 'train_dataset.txt'\n", "with open(train_path, \"w\", encoding=\"utf-8\") as train_file:\n", " for filename in os.listdir(folder_path):\n", " if filename.endswith('.txt') and filename.startswith('A') :\n", " file_path = os.path.join(folder_path, filename)\n", " \n", " file_paths.append(file_path)\n", " \n", "print(file_paths[-1])\n", "\n", "\n", "train_dataset = [TextDataset(tokenizer=tokenizer,file_path=file_paths[i],block_size=64) for i in range(len(file_paths))]\n", " \n", "# Создание даталодера (нарезает текст на оптимальные по длине куски)\n", "data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, \n", " mlm=False)" ] }, { "cell_type": "code", "execution_count": 8, "id": "7430c622", "metadata": { "execution": { "iopub.execute_input": "2024-04-09T18:04:16.878075Z", "iopub.status.busy": "2024-04-09T18:04:16.877192Z", "iopub.status.idle": "2024-04-09T18:11:36.667812Z", "shell.execute_reply": "2024-04-09T18:11:36.666762Z", "shell.execute_reply.started": "2024-04-09T18:04:16.878024Z" }, "tags": [] }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 200/200 [00:44<00:00, 4.50it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "{'train_runtime': 44.4612, 'train_samples_per_second': 4.498, 'train_steps_per_second': 4.498, 'train_loss': 0.001674874871969223, 'epoch': 200.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 200/200 [01:00<00:00, 3.30it/s]\n" ] }, { "name": "stdout", "output_type": 
"stream", "text": [ "{'train_runtime': 60.5495, 'train_samples_per_second': 6.606, 'train_steps_per_second': 3.303, 'train_loss': 0.00654977798461914, 'epoch': 200.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 200/200 [01:40<00:00, 1.99it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "{'train_runtime': 100.271, 'train_samples_per_second': 15.957, 'train_steps_per_second': 1.995, 'train_loss': 0.005510185360908509, 'epoch': 200.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 200/200 [01:39<00:00, 2.02it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "{'train_runtime': 99.1448, 'train_samples_per_second': 10.086, 'train_steps_per_second': 2.017, 'train_loss': 0.008187416791915893, 'epoch': 200.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 200/200 [02:13<00:00, 1.50it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "{'train_runtime': 133.5168, 'train_samples_per_second': 17.975, 'train_steps_per_second': 1.498, 'train_loss': 0.013275017738342285, 'epoch': 200.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "from transformers import Trainer, TrainingArguments\n", "\n", "training_args = TrainingArguments(\n", " output_dir=\"./finetuned\", # The output directory\n", " overwrite_output_dir=True, # Overwrite the content of the output dir\n", " num_train_epochs=200, # number of training epochs\n", " per_device_train_batch_size=32, # batch size for training\n", " per_device_eval_batch_size=32, # batch size for evaluation\n", " warmup_steps=10, # number of warmup steps for learning rate scheduler\n", " gradient_accumulation_steps=16, # to make \"virtual\" batch size larger\n", " fp16=True,\n", " )\n", "\n", "for i in range(5): #len(file_paths)): # ЛУЧШЕ ПОСТАВИТЬ МЕНЬШЕ ИТЕРАЦИЙ\n", " trainer = Trainer(\n", " model=model,\n", " args=training_args,\n", " data_collator=data_collator,\n", " train_dataset=train_dataset[i],\n", " optimizers = (torch.optim.AdamW(model.parameters(),lr=1e-5), None)\n", " )\n", " trainer.train()" ] }, { "cell_type": "code", "execution_count": 10, "id": "b1a35696-42ab-4393-9c38-e4fe69dbafd2", "metadata": { "execution": { "iopub.execute_input": "2024-04-09T18:11:42.456023Z", "iopub.status.busy": "2024-04-09T18:11:42.455193Z", "iopub.status.idle": "2024-04-09T18:14:44.971764Z", "shell.execute_reply": "2024-04-09T18:14:44.970585Z", "shell.execute_reply.started": "2024-04-09T18:11:42.455969Z" }, "tags": [] }, "outputs": [], "source": [ "import pickle\n", "file_path = \"model_rugpt3large_simple_full_text.pkl\"\n", "with open(file_path, 'wb') as f:\n", " pickle.dump(model, f)" ] }, { "cell_type": "code", "execution_count": 9, "id": "b5087073", "metadata": { "execution": { "iopub.execute_input": "2024-04-09T18:11:36.670993Z", "iopub.status.busy": "2024-04-09T18:11:36.670214Z", "iopub.status.idle": "2024-04-09T18:11:42.453770Z", "shell.execute_reply": "2024-04-09T18:11:42.452666Z", "shell.execute_reply.started": "2024-04-09T18:11:36.670960Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Зафайнтюнил я модельку на минутку…\n", "\n", "Что ж: довольно, довольно!\n", "Ты пировать пришел, Осип!\n", "Итак, ты доволен?\n", "Позволь обнять себя, мой милый.\n", "Целуй меня, мой милый, целуй.\n", "\n", "А когда уставится тебе в очи\n", "Свой великолепный, умный взор,\n", "Полюби безумный, веселый нрав\n", "И слабость жалкую души моей.\n", "\n", "Тогда, весь 
{ "cell_type": "code", "execution_count": 9, "id": "b5087073", "metadata": { "execution": { "iopub.execute_input": "2024-04-09T18:11:36.670993Z", "iopub.status.busy": "2024-04-09T18:11:36.670214Z", "iopub.status.idle": "2024-04-09T18:11:42.453770Z", "shell.execute_reply": "2024-04-09T18:11:42.452666Z", "shell.execute_reply.started": "2024-04-09T18:11:36.670960Z" } }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Зафайнтюнил я модельку на минутку…\n", "\n", "Что ж: довольно, довольно!\n", "Ты пировать пришел, Осип!\n", "Итак, ты доволен?\n", "Позволь обнять себя, мой милый.\n", "Целуй меня, мой милый, целуй.\n", "\n", "А когда уставится тебе в очи\n", "Свой великолепный, умный взор,\n", "Полюби безумный, веселый нрав\n", "И слабость жалкую души моей.\n", "\n", "Тогда, весь дрожа, припадая к ногам\n", "И страстно желая любви живой,\n", "Отказываться будешь мысленно\n", "От сладкой жизни с кукольной душой.\n", "\n", "Но скромной\n" ] } ], "source": [ "text = \"Зафайнтюнил я модель\"\n", "input_ids = tokenizer.encode(text, return_tensors=\"pt\").to(DEVICE)\n", "\n", "model.eval()\n", "with torch.no_grad():\n", "    out = model.generate(input_ids,\n", "                         do_sample=True,\n", "                         num_beams=2,\n", "                         temperature=3.5,\n", "                         top_p=0.9,\n", "                         max_length=128,\n", "                         )\n", "\n", "generated_text = list(map(tokenizer.decode, out))[0]\n", "\n", "print(generated_text)" ] }, { "cell_type": "code", "execution_count": 11, "id": "135f64d3-475e-4fbe-bc95-f8a02ea0c518", "metadata": { "execution": { "iopub.execute_input": "2024-04-09T18:15:01.002099Z", "iopub.status.busy": "2024-04-09T18:15:01.001101Z", "iopub.status.idle": "2024-04-09T18:15:06.503924Z", "shell.execute_reply": "2024-04-09T18:15:06.502766Z", "shell.execute_reply.started": "2024-04-09T18:15:01.002046Z" }, "tags": [] }, "outputs": [], "source": [ "from transformers import GPT2LMHeadModel, GPT2Tokenizer\n", "import torch\n", "DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n", "\n", "model_name_or_path = \"sberbank-ai/rugpt3large_based_on_gpt2\"\n", "tokenizer = GPT2Tokenizer.from_pretrained(model_name_or_path)\n", "model = GPT2LMHeadModel.from_pretrained(model_name_or_path).to(DEVICE)" ] }, { "cell_type": "code", "execution_count": null, "id": "a43234e4-a465-4b03-8c76-815997f00847", "metadata": { "execution": { "iopub.execute_input": "2024-04-09T18:15:25.821002Z", "iopub.status.busy": "2024-04-09T18:15:25.819798Z" }, "tags": [] }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 200/200 [00:46<00:00, 4.33it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "{'train_runtime': 46.1863, 'train_samples_per_second': 4.33, 'train_steps_per_second': 4.33, 'train_loss': 0.018622456789016723, 'epoch': 200.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 200/200 [01:05<00:00, 3.06it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "{'train_runtime': 65.3796, 'train_samples_per_second': 6.118, 'train_steps_per_second': 3.059, 'train_loss': 0.021470596790313722, 'epoch': 200.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 200/200 [01:44<00:00, 1.91it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "{'train_runtime': 104.633, 'train_samples_per_second': 15.292, 'train_steps_per_second': 1.911, 'train_loss': 0.04938124179840088, 'epoch': 200.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 200/200 [01:38<00:00, 2.03it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "{'train_runtime': 98.5538, 'train_samples_per_second': 10.147, 'train_steps_per_second': 2.029, 'train_loss': 0.03502564668655395, 'epoch': 200.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ " 73%|███████▎  | 146/200 [01:36<00:38, 1.41it/s]" ] } ], "source": [ "from transformers import Trainer, TrainingArguments\n", "\n", "training_args = TrainingArguments(\n", "    output_dir=\"./finetuned\",          # the output directory\n", "    overwrite_output_dir=True,         # overwrite the content of the output dir\n", "    num_train_epochs=200,              # number of training epochs\n", "    per_device_train_batch_size=32,    # batch size for training\n", "    per_device_eval_batch_size=32,     # batch size for evaluation\n", "    warmup_steps=10,                   # number of warmup steps for the LR scheduler\n", "    gradient_accumulation_steps=16,    # makes the \"virtual\" batch size larger\n", "    fp16=True,\n", ")\n", "\n", "for i in range(5):  # len(file_paths) would cover all files; BETTER TO USE FEWER ITERATIONS\n", "    trainer = Trainer(\n", "        model=model,\n", "        args=training_args,\n", "        data_collator=data_collator,\n", "        train_dataset=train_dataset[i],\n", "        optimizers=(torch.optim.AdamW(model.parameters(), lr=1e-5), None)\n", "    )\n", "    trainer.train()" ] },
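{ "cell_type": "markdown", "id": "e1f2a3b4", "metadata": {}, "source": [ "The run above was interrupted at 146/200 steps. If checkpoints are being written (e.g. `save_steps=50` in `TrainingArguments`; with this notebook's settings the default `save_steps=500` writes none during a 200-step run), an interrupted run can be resumed from the latest checkpoint. A hedged sketch:" ] }, { "cell_type": "code", "execution_count": null, "id": "f2a3b4c5", "metadata": {}, "outputs": [], "source": [ "# Hedged sketch: resume from the newest checkpoint in output_dir.\n", "# Requires that checkpoints were actually saved during the interrupted run.\n", "trainer.train(resume_from_checkpoint=True)" ] },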
{ "cell_type": "code", "execution_count": null, "id": "666a0f5f-54a6-4e85-894b-607d3cd74659", "metadata": {}, "outputs": [], "source": [ "import pickle\n", "file_path = \"model_rugpt3large_full_text.pkl\"\n", "with open(file_path, 'wb') as f:\n", "    pickle.dump(model, f)" ] }, { "cell_type": "code", "execution_count": null, "id": "42ac4b04-08d1-4184-a46c-3f442287de5c", "metadata": {}, "outputs": [], "source": [ "text = \"Зафайнтюнил я модель\"\n", "input_ids = tokenizer.encode(text, return_tensors=\"pt\").to(DEVICE)\n", "\n", "model.eval()\n", "with torch.no_grad():\n", "    out = model.generate(input_ids,\n", "                         do_sample=True,\n", "                         num_beams=2,\n", "                         temperature=3.5,\n", "                         top_p=0.9,\n", "                         max_length=128,\n", "                         )\n", "\n", "generated_text = list(map(tokenizer.decode, out))[0]\n", "\n", "print(generated_text)" ] } ], "metadata": { "kernelspec": { "display_name": "DataSphere Kernel", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.12" } }, "nbformat": 4, "nbformat_minor": 5 }