{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "krQK4P9tWs_F" }, "source": [ "# 🧠 GPT-2 124M Fine-tuned on Wikitext\n", "\n", "This is an experiment aiming to achieve practical understanding of learning 🤗 Transformers and 🤗 Datasets. To follow the guide outlined [Hugging Face Notebooks](https://github.com/huggingface/notebooks/blob/main/examples/language_modeling_from_scratch.ipynb)." ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "9-IVR9eMWY8v", "outputId": "a2ebf62e-55b2-488e-95c8-b1ee10026308" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Collecting datasets\n", " Downloading datasets-2.15.0-py3-none-any.whl.metadata (20 kB)\n", "Collecting transformers\n", " Downloading transformers-4.35.2-py3-none-any.whl.metadata (123 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m123.5/123.5 kB\u001b[0m \u001b[31m3.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m\n", "\u001b[?25hCollecting accelerate\n", " Downloading accelerate-0.25.0-py3-none-any.whl.metadata (18 kB)\n", "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from datasets) (1.24.1)\n", "Collecting pyarrow>=8.0.0 (from datasets)\n", " Downloading pyarrow-14.0.1-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.0 kB)\n", "Collecting pyarrow-hotfix (from datasets)\n", " Downloading pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)\n", "Collecting dill<0.3.8,>=0.3.0 (from datasets)\n", " Downloading dill-0.3.7-py3-none-any.whl.metadata (9.9 kB)\n", "Collecting pandas (from datasets)\n", " Downloading pandas-2.1.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)\n", "Requirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.10/dist-packages (from datasets) (2.31.0)\n", "Collecting tqdm>=4.62.1 (from datasets)\n", " Downloading tqdm-4.66.1-py3-none-any.whl.metadata (57 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m57.6/57.6 kB\u001b[0m \u001b[31m17.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting xxhash (from datasets)\n", " Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)\n", "Collecting multiprocess (from datasets)\n", " Downloading multiprocess-0.70.15-py310-none-any.whl.metadata (7.2 kB)\n", "Requirement already satisfied: fsspec<=2023.10.0,>=2023.1.0 in /usr/local/lib/python3.10/dist-packages (from fsspec[http]<=2023.10.0,>=2023.1.0->datasets) (2023.4.0)\n", "Collecting aiohttp (from datasets)\n", " Downloading aiohttp-3.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.4 kB)\n", "Collecting huggingface-hub>=0.18.0 (from datasets)\n", " Downloading huggingface_hub-0.19.4-py3-none-any.whl.metadata (14 kB)\n", "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from datasets) (23.2)\n", "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from datasets) (6.0.1)\n", "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers) (3.9.0)\n", "Collecting regex!=2019.12.17 (from transformers)\n", " Downloading regex-2023.10.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m40.9/40.9 kB\u001b[0m \u001b[31m2.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting tokenizers<0.19,>=0.14 (from transformers)\n", " Downloading tokenizers-0.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)\n", "Collecting safetensors>=0.3.1 (from transformers)\n", " Downloading safetensors-0.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)\n", "Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from accelerate) (5.9.6)\n", "Requirement already satisfied: torch>=1.10.0 in /usr/local/lib/python3.10/dist-packages (from accelerate) (2.1.0+cu118)\n", "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets) (23.1.0)\n", "Collecting multidict<7.0,>=4.5 (from aiohttp->datasets)\n", " Downloading multidict-6.0.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (114 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m114.5/114.5 kB\u001b[0m \u001b[31m15.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hCollecting yarl<2.0,>=1.0 (from aiohttp->datasets)\n", " Downloading yarl-1.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (28 kB)\n", "Collecting frozenlist>=1.1.1 (from aiohttp->datasets)\n", " Downloading frozenlist-1.4.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.2 kB)\n", "Collecting aiosignal>=1.1.2 (from aiohttp->datasets)\n", " Downloading aiosignal-1.3.1-py3-none-any.whl (7.6 kB)\n", "Collecting async-timeout<5.0,>=4.0 (from aiohttp->datasets)\n", " Downloading async_timeout-4.0.3-py3-none-any.whl.metadata (4.2 kB)\n", "INFO: pip is looking at multiple versions of huggingface-hub to determine which version is compatible with other requirements. This could take a while.\n", "Collecting huggingface-hub>=0.18.0 (from datasets)\n", " Downloading huggingface_hub-0.19.3-py3-none-any.whl.metadata (14 kB)\n", " Downloading huggingface_hub-0.19.2-py3-none-any.whl.metadata (13 kB)\n", " Downloading huggingface_hub-0.19.1-py3-none-any.whl.metadata (13 kB)\n", " Downloading huggingface_hub-0.19.0-py3-none-any.whl.metadata (13 kB)\n", " Downloading huggingface_hub-0.18.0-py3-none-any.whl.metadata (13 kB)\n", "Collecting fsspec[http]<=2023.10.0,>=2023.1.0 (from datasets)\n", " Downloading fsspec-2023.10.0-py3-none-any.whl.metadata (6.8 kB)\n", "Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.10/dist-packages (from huggingface-hub>=0.18.0->datasets) (4.4.0)\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets) (2.1.1)\n", "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets) (3.4)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets) (1.26.13)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets) (2022.12.7)\n", "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (1.12)\n", "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (3.0)\n", "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (3.1.2)\n", "Requirement already satisfied: triton==2.1.0 in /usr/local/lib/python3.10/dist-packages (from torch>=1.10.0->accelerate) (2.1.0)\n", "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets) (2.8.2)\n", "Collecting pytz>=2020.1 (from pandas->datasets)\n", " Downloading pytz-2023.3.post1-py2.py3-none-any.whl.metadata (22 kB)\n", "Collecting tzdata>=2022.1 (from pandas->datasets)\n", " Downloading tzdata-2023.3-py2.py3-none-any.whl (341 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m341.8/341.8 kB\u001b[0m \u001b[31m12.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hRequirement already satisfied: six>=1.5 in /usr/lib/python3/dist-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.16.0)\n", "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch>=1.10.0->accelerate) (2.1.2)\n", "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch>=1.10.0->accelerate) (1.3.0)\n", "Downloading datasets-2.15.0-py3-none-any.whl (521 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m521.2/521.2 kB\u001b[0m \u001b[31m56.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading transformers-4.35.2-py3-none-any.whl (7.9 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.9/7.9 MB\u001b[0m \u001b[31m98.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m0:01\u001b[0m\n", "\u001b[?25hDownloading accelerate-0.25.0-py3-none-any.whl (265 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m265.7/265.7 kB\u001b[0m \u001b[31m72.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading dill-0.3.7-py3-none-any.whl (115 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m115.3/115.3 kB\u001b[0m \u001b[31m34.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading fsspec-2023.10.0-py3-none-any.whl (166 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m166.4/166.4 kB\u001b[0m \u001b[31m46.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading huggingface_hub-0.19.4-py3-none-any.whl (311 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m311.7/311.7 kB\u001b[0m \u001b[31m80.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading aiohttp-3.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.2/1.2 MB\u001b[0m \u001b[31m73.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading pyarrow-14.0.1-cp310-cp310-manylinux_2_28_x86_64.whl (38.0 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m38.0/38.0 MB\u001b[0m \u001b[31m94.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m00:01\u001b[0m\n", "\u001b[?25hDownloading regex-2023.10.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (773 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m773.9/773.9 kB\u001b[0m \u001b[31m188.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading safetensors-0.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m239.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading tokenizers-0.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.8/3.8 MB\u001b[0m \u001b[31m123.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading tqdm-4.66.1-py3-none-any.whl (78 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m78.3/78.3 kB\u001b[0m \u001b[31m26.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading multiprocess-0.70.15-py310-none-any.whl (134 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m42.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading pandas-2.1.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.3 MB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m12.3/12.3 MB\u001b[0m \u001b[31m111.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n", "\u001b[?25hDownloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)\n", "Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m75.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading async_timeout-4.0.3-py3-none-any.whl (5.7 kB)\n", "Downloading frozenlist-1.4.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (225 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m225.7/225.7 kB\u001b[0m \u001b[31m61.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading pytz-2023.3.post1-py2.py3-none-any.whl (502 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m502.5/502.5 kB\u001b[0m \u001b[31m108.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hDownloading yarl-1.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (300 kB)\n", "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m300.7/300.7 kB\u001b[0m \u001b[31m79.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25hInstalling collected packages: pytz, xxhash, tzdata, tqdm, safetensors, regex, pyarrow-hotfix, pyarrow, multidict, fsspec, frozenlist, dill, async-timeout, yarl, pandas, multiprocess, huggingface-hub, aiosignal, tokenizers, aiohttp, accelerate, transformers, datasets\n", " Attempting uninstall: fsspec\n", " Found existing installation: fsspec 2023.4.0\n", " Uninstalling fsspec-2023.4.0:\n", " Successfully uninstalled fsspec-2023.4.0\n", "Successfully installed accelerate-0.25.0 aiohttp-3.9.1 aiosignal-1.3.1 async-timeout-4.0.3 datasets-2.15.0 dill-0.3.7 frozenlist-1.4.0 fsspec-2023.10.0 huggingface-hub-0.19.4 multidict-6.0.4 multiprocess-0.70.15 pandas-2.1.3 pyarrow-14.0.1 pyarrow-hotfix-0.6 pytz-2023.3.post1 regex-2023.10.3 safetensors-0.4.1 tokenizers-0.15.0 tqdm-4.66.1 transformers-4.35.2 tzdata-2023.3 xxhash-3.4.1 yarl-1.9.3\n", "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", "\u001b[0m" ] } ], "source": [ "# Install neccessary packages\n", "!pip install datasets transformers accelerate" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 331, "referenced_widgets": [ "72d3f33c56a14b01bef05ea2ae98e72f", "8d44cf16b34d40bd8702560ae149192c", "13af4957747d40bbb9501bc13c3c160b", "1be0b602c74f4a7182bd041aeba58112", "af225dc2d79649c2ac0e853c517a0482", "f019d8eee73f4b7b8312029f03040c45", "d1deb908be244cfb8dd253b3cc081510", "06e659f7bd9149e784a5cb68efbad736", "ba1f1326e12d437a9fd76faa226eec44", "65ea346ce3174bb098344531db439eac", "6ac7576be7204aeca73385c48e1c5d0a", "7fd3c4de83234008a72d9541c29ce765", "b81585732ca14c89ae1b5bb25735cd62", "a68a4415305045c0a89c60349175cc82", "7316ece4ad474193bcc5b5db55632561", "c0bb9ace36d64b8296bd35d538bd4f7e", "03ad39c22f644e55bcffa070e3832582" ] }, "id": "lpuAz9I-XsQQ", "outputId": "bf420c42-7ff0-4a4c-da57-420691eaec5d" }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "e61086bda2ca40b5a189544f6a8b43a5", "version_major": 2, "version_minor": 0 }, "text/plain": [ "VBox(children=(HTML(value='
Epoch | \n", "Training Loss | \n", "Validation Loss | \n", "
---|---|---|
1 | \n", "3.133500 | \n", "3.036319 | \n", "
2 | \n", "3.064300 | \n", "2.996815 | \n", "
3 | \n", "3.038400 | \n", "2.984082 | \n", "
"
],
"text/plain": [
"
Copy a token from your Hugging Face\ntokens page and paste it below.
Immediately click login after copying\nyour token or it might be stored in plain text in this notebook file.